From 6bf794a02a509bf051c50fe2f14009e117ef50ed Mon Sep 17 00:00:00 2001 From: Daniil Fukalov Date: Wed, 1 Apr 2026 11:55:17 +0200 Subject: [PATCH] [AMDGPU] Disable generic DAG combines at -O0 to preserve debuggability. (#176304) Disable generic DAG combines for AMDGPU at -O0 via disableGenericCombines() to preserve instructions that users may want to set breakpoints on during debugging. Assisted-by: Cursor / Claude Opus 4.6 --- clang/test/CodeGenCUDA/fp-contract.cu | 10 +- .../Target/AMDGPU/AMDGPUSelectionDAGInfo.h | 7 + llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll | 112 +- llvm/test/CodeGen/AMDGPU/basic-branch.ll | 11 +- .../AMDGPU/bb-prolog-spill-during-regalloc.ll | 4 +- .../CodeGen/AMDGPU/cf-loop-on-constant.ll | 66 +- llvm/test/CodeGen/AMDGPU/collapse-endcf.ll | 140 +- .../AMDGPU/control-flow-fastregalloc.ll | 4 +- .../CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll | 62 +- llvm/test/CodeGen/AMDGPU/div_i128.ll | 1338 ++- .../CodeGen/AMDGPU/extract_vector_dynelt.ll | 1183 +- .../flat-scratch-alloca-issue-155902.ll | 200 + .../CodeGen/AMDGPU/indirect-addressing-si.ll | 1570 ++- .../AMDGPU/indirect-addressing-term.ll | 1 + .../CodeGen/AMDGPU/insert-delay-alu-bug.ll | 101 +- .../CodeGen/AMDGPU/insert_vector_dynelt.ll | 7734 ++++++++----- .../kernel-vgpr-spill-mubuf-with-voffset.ll | 22 +- ...llvm.amdgcn.ds.gws.barrier-fastregalloc.ll | 1 + .../CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll | 322 +- llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll | 6 +- .../CodeGen/AMDGPU/load-global-invariant.ll | 10 + .../AMDGPU/memory-legalizer-flat-agent.ll | 7544 ++++++++++++- .../AMDGPU/memory-legalizer-flat-cluster.ll | 7544 ++++++++++++- .../AMDGPU/memory-legalizer-flat-lastuse.ll | 61 +- .../memory-legalizer-flat-nontemporal.ll | 704 +- .../memory-legalizer-flat-singlethread.ll | 7608 +++++++++++-- .../AMDGPU/memory-legalizer-flat-system.ll | 7544 ++++++++++++- .../AMDGPU/memory-legalizer-flat-volatile.ll | 478 +- .../AMDGPU/memory-legalizer-flat-wavefront.ll | 7530 +++++++++++-- .../AMDGPU/memory-legalizer-flat-workgroup.ll | 7210 +++++++++++- .../AMDGPU/memory-legalizer-global-agent.ll | 7223 +++++++++++- .../AMDGPU/memory-legalizer-global-cluster.ll | 7223 +++++++++++- .../AMDGPU/memory-legalizer-global-lastuse.ll | 104 +- .../memory-legalizer-global-nontemporal.ll | 910 +- .../memory-legalizer-global-singlethread.ll | 7348 +++++++++++- .../AMDGPU/memory-legalizer-global-system.ll | 6879 +++++++++++- .../memory-legalizer-global-volatile.ll | 673 +- .../memory-legalizer-global-wavefront.ll | 7348 +++++++++++- .../memory-legalizer-global-workgroup.ll | 7313 +++++++++++- .../AMDGPU/memory-legalizer-local-agent.ll | 8488 ++++++++++++-- .../AMDGPU/memory-legalizer-local-cluster.ll | 8488 ++++++++++++-- .../memory-legalizer-local-nontemporal.ll | 289 +- .../memory-legalizer-local-singlethread.ll | 8524 ++++++++++++-- .../AMDGPU/memory-legalizer-local-system.ll | 8488 ++++++++++++-- .../AMDGPU/memory-legalizer-local-volatile.ll | 258 +- .../memory-legalizer-local-wavefront.ll | 8524 ++++++++++++-- .../memory-legalizer-local-workgroup.ll | 8488 ++++++++++++-- .../AMDGPU/memory-legalizer-private-agent.ll | 9804 +++++++++++++--- .../memory-legalizer-private-cluster.ll | 9804 +++++++++++++--- .../memory-legalizer-private-lastuse.ll | 48 +- .../memory-legalizer-private-nontemporal.ll | 415 +- .../memory-legalizer-private-singlethread.ll | 9910 ++++++++++++++--- .../AMDGPU/memory-legalizer-private-system.ll | 9384 +++++++++++++--- .../memory-legalizer-private-volatile.ll | 232 +- .../memory-legalizer-private-wavefront.ll | 9910 ++++++++++++++--- .../memory-legalizer-private-workgroup.ll | 9910 ++++++++++++++--- .../AMDGPU/partial-sgpr-to-vgpr-spills.ll | 46 +- llvm/test/CodeGen/AMDGPU/rem_i128.ll | 2046 ++-- .../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll | 12 +- .../CodeGen/AMDGPU/scalar_to_vector_v2x16.ll | 10 +- llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll | 8 +- .../CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll | 12 +- .../AMDGPU/sgpr-spills-split-regalloc.ll | 16 +- .../AMDGPU/smfmac_alloc_failure_no_agpr_O0.ll | 119 +- llvm/test/CodeGen/AMDGPU/sopk-no-literal.ll | 5 +- llvm/test/CodeGen/AMDGPU/spill-m0.ll | 2 +- .../spill-vgpr-to-agpr-update-regscavenger.ll | 16 +- llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll | 28 +- .../CodeGen/AMDGPU/stacksave_stackrestore.ll | 62 +- llvm/test/CodeGen/AMDGPU/trap-abis.ll | 10 + .../test/CodeGen/AMDGPU/vgpr-limit-gfx1250.ll | 8 +- .../AMDGPU/vgpr-spill-placement-issue61083.ll | 4 +- .../CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll | 2 + .../wait-xcnt-atomic-rmw-optimization.ll | 117 +- .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 56 +- llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 114 +- llvm/test/CodeGen/AMDGPU/zext-lid.ll | 8 +- 77 files changed, 191621 insertions(+), 26192 deletions(-) diff --git a/clang/test/CodeGenCUDA/fp-contract.cu b/clang/test/CodeGenCUDA/fp-contract.cu index d6c796a817cb..e5fec7bda1d5 100644 --- a/clang/test/CodeGenCUDA/fp-contract.cu +++ b/clang/test/CodeGenCUDA/fp-contract.cu @@ -15,7 +15,7 @@ // RUN: -disable-llvm-passes -o - %s \ // RUN: | FileCheck -check-prefixes=COMMON,NV-ON %s // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ -// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \ +// RUN: -O1 -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \ // RUN: | FileCheck -check-prefixes=COMMON,AMD-ON %s // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ // RUN: -O3 -o - %s \ @@ -44,7 +44,7 @@ // RUN: -ffp-contract=fast -disable-llvm-passes -o - %s \ // RUN: | FileCheck -check-prefixes=COMMON,NV-ON %s // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ -// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \ +// RUN: -O1 -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \ // RUN: -ffp-contract=fast \ // RUN: | FileCheck -check-prefixes=COMMON,AMD-ON %s // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ @@ -79,7 +79,7 @@ // RUN: -ffp-contract=fast-honor-pragmas -disable-llvm-passes -o - %s \ // RUN: | FileCheck -check-prefixes=COMMON,NV-ON %s // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ -// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \ +// RUN: -O1 -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \ // RUN: -ffp-contract=fast-honor-pragmas \ // RUN: | FileCheck -check-prefixes=COMMON,AMD-ON %s // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ @@ -117,7 +117,7 @@ // RUN: -ffp-contract=on -disable-llvm-passes -o - %s \ // RUN: | FileCheck -check-prefixes=COMMON,NV-ON %s // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \ -// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \ +// RUN: -O1 -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \ // RUN: -ffp-contract=on \ // RUN: | FileCheck -check-prefixes=COMMON,AMD-ON %s // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ @@ -180,7 +180,7 @@ __host__ __device__ float func(float a, float b, float c) { return a + b * c; } // COMMON-LABEL: _Z4funcfff // NV-ON: fma.rn.f32 // NV-ON-NEXT: st.param.b32 -// AMD-ON: v_fmac_f32_e64 +// AMD-ON: v_fmac_f32_e32 // AMD-ON-NEXT: s_setpc_b64 // NV-OFF: mul.rn.f32 diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h index bae614a20039..8b4ba10951ea 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h @@ -50,6 +50,13 @@ public: ~AMDGPUSelectionDAGInfo() override; + bool disableGenericCombines(CodeGenOptLevel OptLevel) const override { + // Disable generic DAG combines at -O0 to preserve debuggability. + // This prevents optimizations like constant reassociation that would + // eliminate intermediate instructions users want to step through. + return OptLevel == CodeGenOptLevel::None; + } + const char *getTargetNodeName(unsigned Opcode) const override; void verifyTargetNode(const SelectionDAG &DAG, diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index ae42404fd381..d802e5565c5d 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -458,18 +458,15 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 { ; GFX908-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 ; GFX908-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 ; GFX908-NEXT: s_cmp_eq_u32 s7, s5 -; GFX908-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX908-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX908-NEXT: s_mov_b64 s[4:5], -1 -; GFX908-NEXT: s_mov_b32 s6, 1 -; GFX908-NEXT: v_cmp_ne_u32_e64 s[6:7], v2, s6 +; GFX908-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GFX908-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX908-NEXT: s_cbranch_vccnz .LBB4_3 ; GFX908-NEXT: .LBB4_1: ; %Flow -; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] -; GFX908-NEXT: s_mov_b32 s4, 1 -; GFX908-NEXT: v_cmp_ne_u32_e64 s[4:5], v2, s4 +; GFX908-NEXT: s_mov_b64 s[6:7], -1 +; GFX908-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX908-NEXT: s_and_b64 vcc, exec, s[4:5] ; GFX908-NEXT: s_cbranch_vccnz .LBB4_4 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.private @@ -507,18 +504,15 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 { ; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 ; GFX90A-NEXT: s_cmp_eq_u32 s7, s5 -; GFX90A-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_mov_b64 s[4:5], -1 -; GFX90A-NEXT: s_mov_b32 s6, 1 -; GFX90A-NEXT: v_cmp_ne_u32_e64 s[6:7], v2, s6 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GFX90A-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_cbranch_vccnz .LBB4_3 ; GFX90A-NEXT: .LBB4_1: ; %Flow -; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; GFX90A-NEXT: s_mov_b32 s4, 1 -; GFX90A-NEXT: v_cmp_ne_u32_e64 s[4:5], v4, s4 +; GFX90A-NEXT: s_mov_b64 s[6:7], -1 +; GFX90A-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX90A-NEXT: s_and_b64 vcc, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_vccnz .LBB4_4 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.private @@ -556,23 +550,19 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 { ; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 ; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 ; GFX942-NEXT: s_cmp_eq_u32 s3, s1 -; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX942-NEXT: s_mov_b64 s[0:1], -1 -; GFX942-NEXT: s_mov_b32 s2, 1 -; GFX942-NEXT: v_cmp_ne_u32_e64 s[2:3], v2, s2 +; GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] ; GFX942-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_cbranch_vccnz .LBB4_3 ; GFX942-NEXT: .LBB4_1: ; %Flow -; GFX942-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] -; GFX942-NEXT: s_mov_b32 s0, 1 -; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], v4, s0 +; GFX942-NEXT: s_mov_b64 s[2:3], -1 +; GFX942-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX942-NEXT: s_cbranch_vccnz .LBB4_4 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.private ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1] @@ -603,18 +593,15 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 { ; GFX1100-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 ; GFX1100-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 ; GFX1100-NEXT: s_cmp_eq_u32 s3, s1 -; GFX1100-NEXT: s_cselect_b32 s0, -1, 0 -; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX1100-NEXT: s_cselect_b32 s1, -1, 0 ; GFX1100-NEXT: s_mov_b32 s0, -1 -; GFX1100-NEXT: s_mov_b32 s1, 1 -; GFX1100-NEXT: v_cmp_ne_u32_e64 s1, v2, s1 +; GFX1100-NEXT: s_xor_b32 s1, s1, s0 ; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s1 ; GFX1100-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1100-NEXT: s_cbranch_vccnz .LBB4_3 ; GFX1100-NEXT: .LBB4_1: ; %Flow -; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX1100-NEXT: s_mov_b32 s0, 1 -; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, v2, s0 +; GFX1100-NEXT: s_mov_b32 s1, -1 +; GFX1100-NEXT: s_xor_b32 s0, s0, s1 ; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s0 ; GFX1100-NEXT: s_cbranch_vccnz .LBB4_4 ; GFX1100-NEXT: ; %bb.2: ; %atomicrmw.private @@ -658,23 +645,20 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 { ; GFX1200-NEXT: s_add_co_ci_u32 s3, s3, global@rel32@hi+24 ; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX1200-NEXT: s_cmp_eq_u32 s3, s1 -; GFX1200-NEXT: s_cselect_b32 s0, -1, 0 -; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX1200-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX1200-NEXT: s_cselect_b32 s1, -1, 0 ; GFX1200-NEXT: s_mov_b32 s0, -1 -; GFX1200-NEXT: s_mov_b32 s1, 1 ; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX1200-NEXT: v_cmp_ne_u32_e64 s1, v2, s1 +; GFX1200-NEXT: s_xor_b32 s1, s1, s0 +; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s1 ; GFX1200-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX1200-NEXT: s_cbranch_vccnz .LBB4_3 ; GFX1200-NEXT: .LBB4_1: ; %Flow +; GFX1200-NEXT: s_mov_b32 s1, -1 ; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX1200-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX1200-NEXT: s_mov_b32 s0, 1 +; GFX1200-NEXT: s_xor_b32 s0, s0, s1 ; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX1200-NEXT: v_cmp_ne_u32_e64 s0, v2, s0 ; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s0 ; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX1200-NEXT: s_cbranch_vccnz .LBB4_4 @@ -722,11 +706,9 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX908-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 ; GFX908-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 ; GFX908-NEXT: s_cmp_eq_u32 s7, s5 -; GFX908-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX908-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX908-NEXT: s_mov_b64 s[4:5], -1 -; GFX908-NEXT: s_mov_b32 s6, 1 -; GFX908-NEXT: v_cmp_ne_u32_e64 s[6:7], v2, s6 +; GFX908-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GFX908-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX908-NEXT: s_cbranch_vccnz .LBB5_2 @@ -789,18 +771,15 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 ; GFX90A-NEXT: s_cmp_eq_u32 s7, s5 -; GFX90A-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_mov_b64 s[4:5], -1 -; GFX90A-NEXT: s_mov_b32 s6, 1 -; GFX90A-NEXT: v_cmp_ne_u32_e64 s[6:7], v2, s6 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GFX90A-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_cbranch_vccnz .LBB5_3 ; GFX90A-NEXT: .LBB5_1: ; %Flow4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; GFX90A-NEXT: s_mov_b32 s4, 1 -; GFX90A-NEXT: v_cmp_ne_u32_e64 s[4:5], v4, s4 +; GFX90A-NEXT: s_mov_b64 s[6:7], -1 +; GFX90A-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX90A-NEXT: s_and_b64 vcc, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_vccnz .LBB5_10 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.shared @@ -812,11 +791,9 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 ; GFX90A-NEXT: s_cmp_eq_u32 s7, s5 -; GFX90A-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_mov_b64 s[4:5], -1 -; GFX90A-NEXT: s_mov_b32 s6, 1 -; GFX90A-NEXT: v_cmp_ne_u32_e64 s[6:7], v2, s6 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GFX90A-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_cbranch_vccnz .LBB5_5 @@ -881,18 +858,15 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 ; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 ; GFX942-NEXT: s_cmp_eq_u32 s3, s1 -; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX942-NEXT: s_mov_b64 s[0:1], -1 -; GFX942-NEXT: s_mov_b32 s2, 1 -; GFX942-NEXT: v_cmp_ne_u32_e64 s[2:3], v2, s2 +; GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] ; GFX942-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_cbranch_vccnz .LBB5_3 ; GFX942-NEXT: .LBB5_1: ; %Flow4 -; GFX942-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] -; GFX942-NEXT: s_mov_b32 s0, 1 -; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], v4, s0 +; GFX942-NEXT: s_mov_b64 s[2:3], -1 +; GFX942-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX942-NEXT: s_cbranch_vccnz .LBB5_10 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.shared @@ -904,11 +878,9 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 ; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 ; GFX942-NEXT: s_cmp_eq_u32 s3, s1 -; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX942-NEXT: s_mov_b64 s[0:1], -1 -; GFX942-NEXT: s_mov_b32 s2, 1 -; GFX942-NEXT: v_cmp_ne_u32_e64 s[2:3], v2, s2 +; GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] ; GFX942-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_cbranch_vccnz .LBB5_5 @@ -971,11 +943,9 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX1100-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 ; GFX1100-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 ; GFX1100-NEXT: s_cmp_eq_u32 s3, s1 -; GFX1100-NEXT: s_cselect_b32 s0, -1, 0 -; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX1100-NEXT: s_cselect_b32 s1, -1, 0 ; GFX1100-NEXT: s_mov_b32 s0, -1 -; GFX1100-NEXT: s_mov_b32 s1, 1 -; GFX1100-NEXT: v_cmp_ne_u32_e64 s1, v2, s1 +; GFX1100-NEXT: s_xor_b32 s1, s1, s0 ; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s1 ; GFX1100-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1100-NEXT: s_cbranch_vccnz .LBB5_2 @@ -1043,13 +1013,11 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX1200-NEXT: s_add_co_ci_u32 s3, s3, global@rel32@hi+24 ; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX1200-NEXT: s_cmp_eq_u32 s3, s1 -; GFX1200-NEXT: s_cselect_b32 s0, -1, 0 -; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX1200-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX1200-NEXT: s_cselect_b32 s1, -1, 0 ; GFX1200-NEXT: s_mov_b32 s0, -1 -; GFX1200-NEXT: s_mov_b32 s1, 1 ; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX1200-NEXT: v_cmp_ne_u32_e64 s1, v2, s1 +; GFX1200-NEXT: s_xor_b32 s1, s1, s0 +; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s1 ; GFX1200-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0) diff --git a/llvm/test/CodeGen/AMDGPU/basic-branch.ll b/llvm/test/CodeGen/AMDGPU/basic-branch.ll index 3706eb5166af..d20cc7379598 100644 --- a/llvm/test/CodeGen/AMDGPU/basic-branch.ll +++ b/llvm/test/CodeGen/AMDGPU/basic-branch.ll @@ -8,7 +8,8 @@ ; GCN-LABEL: {{^}}test_branch: ; GCNNOOPT: v_writelane_b32 ; GCNNOOPT: v_writelane_b32 -; GCN: s_cbranch_scc1 [[END:.LBB[0-9]+_[0-9]+]] +; GCNNOOPT: s_cbranch_vccnz [[END:.LBB[0-9]+_[0-9]+]] +; GCNOPT: s_cbranch_scc1 [[END:.LBB[0-9]+_[0-9]+]] ; GCNNOOPT: v_readlane_b32 ; GCNNOOPT: v_readlane_b32 @@ -31,11 +32,11 @@ end: ; GCN-LABEL: {{^}}test_brcc_i1: ; GCN: s_load_{{dword|b32}} [[VAL:s[0-9]+]] -; GCNNOOPT: s_mov_b32 [[ONE:s[0-9]+]], 1{{$}} -; GCNNOOPT: s_and_b32 s{{[0-9]+}}, [[VAL]], [[ONE]] +; GCNNOOPT: s_and_b32 s{{[0-9]+}}, 1, [[VAL]] ; GCNOPT: s_bitcmp0_b32 [[VAL]], 0 -; GCNNOOPT: s_cmp_eq_u32 -; GCN: s_cbranch_scc1 [[END:.LBB[0-9]+_[0-9]+]] +; GCNNOOPT: s_cmp_{{eq|lg}}_u32 +; GCNNOOPT: s_cbranch_vccnz [[END:.LBB[0-9]+_[0-9]+]] +; GCNOPT: s_cbranch_scc1 [[END:.LBB[0-9]+_[0-9]+]] ; GCN: buffer_store_{{dword|b32}} diff --git a/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll b/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll index d4ef12a2b9ad..dfeeb02ee5af 100644 --- a/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll @@ -10,8 +10,8 @@ define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) { ; REGALLOC-NEXT: {{ $}} ; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr2, %stack.4, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) ; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr1, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) - ; REGALLOC-NEXT: renamable $sgpr4 = S_MOV_B32 49 - ; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = V_CMP_GT_I32_e64 killed $vgpr0, killed $sgpr4, implicit $exec + ; REGALLOC-NEXT: renamable $sgpr4 = S_MOV_B32 50 + ; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = V_CMP_GE_I32_e64 killed $vgpr0, killed $sgpr4, implicit $exec ; REGALLOC-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; REGALLOC-NEXT: renamable $sgpr6_sgpr7 = COPY $exec, implicit-def $exec ; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = S_AND_B64 renamable $sgpr6_sgpr7, killed renamable $sgpr4_sgpr5, implicit-def dead $scc diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll index 7660445b8ab3..a60064564341 100644 --- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -40,6 +40,7 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN_DBG-NEXT: s_add_u32 s12, s12, s11 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 ; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN_DBG-NEXT: s_load_dword s1, s[4:5], 0xa ; GCN_DBG-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 @@ -47,13 +48,16 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s2, -1 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: s_cmp_lg_u32 s1, s2 +; GCN_DBG-NEXT: s_cmp_eq_u32 s1, s2 +; GCN_DBG-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN_DBG-NEXT: s_mov_b64 s[4:5], -1 +; GCN_DBG-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] ; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 -; GCN_DBG-NEXT: s_mov_b64 s[6:7], exec -; GCN_DBG-NEXT: s_mov_b64 exec, -1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN_DBG-NEXT: buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] -; GCN_DBG-NEXT: s_cbranch_scc1 .LBB0_2 +; GCN_DBG-NEXT: s_cbranch_vccnz .LBB0_2 ; GCN_DBG-NEXT: ; %bb.1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB0_2: ; %for.body @@ -64,11 +68,11 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 -; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 -; GCN_DBG-NEXT: s_mov_b32 s1, 2 -; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 -; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 -; GCN_DBG-NEXT: s_mov_b32 s2, 0x80 +; GCN_DBG-NEXT: v_readlane_b32 s1, v2, 0 +; GCN_DBG-NEXT: s_mov_b32 s2, 32 +; GCN_DBG-NEXT: s_add_i32 s2, s0, s2 +; GCN_DBG-NEXT: s_mov_b32 s3, 2 +; GCN_DBG-NEXT: s_lshl_b32 s2, s2, s3 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 ; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 @@ -141,6 +145,7 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi ; GCN_DBG-NEXT: s_add_u32 s12, s12, s11 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 ; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN_DBG-NEXT: s_load_dword s1, s[4:5], 0xa ; GCN_DBG-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 @@ -160,11 +165,11 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 -; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 -; GCN_DBG-NEXT: s_mov_b32 s1, 2 -; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 -; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 -; GCN_DBG-NEXT: s_mov_b32 s2, 0x80 +; GCN_DBG-NEXT: v_readlane_b32 s1, v2, 0 +; GCN_DBG-NEXT: s_mov_b32 s2, 32 +; GCN_DBG-NEXT: s_add_i32 s2, s0, s2 +; GCN_DBG-NEXT: s_mov_b32 s3, 2 +; GCN_DBG-NEXT: s_lshl_b32 s2, s2, s3 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 ; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 @@ -224,6 +229,7 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw ; GCN_DBG-NEXT: s_add_u32 s12, s12, s11 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 ; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN_DBG-NEXT: s_load_dword s1, s[4:5], 0xa ; GCN_DBG-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 @@ -243,11 +249,11 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 -; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 -; GCN_DBG-NEXT: s_mov_b32 s1, 2 -; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 -; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 -; GCN_DBG-NEXT: s_mov_b32 s2, 0x80 +; GCN_DBG-NEXT: v_readlane_b32 s1, v2, 0 +; GCN_DBG-NEXT: s_mov_b32 s2, 32 +; GCN_DBG-NEXT: s_add_i32 s2, s0, s2 +; GCN_DBG-NEXT: s_mov_b32 s3, 2 +; GCN_DBG-NEXT: s_lshl_b32 s2, s2, s3 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 ; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 @@ -308,6 +314,7 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw ; GCN_DBG-NEXT: s_add_u32 s12, s12, s11 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 ; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN_DBG-NEXT: s_load_dword s1, s[4:5], 0xa ; GCN_DBG-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 @@ -327,11 +334,11 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 -; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 -; GCN_DBG-NEXT: s_mov_b32 s1, 2 -; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 -; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 -; GCN_DBG-NEXT: s_mov_b32 s2, 0x80 +; GCN_DBG-NEXT: v_readlane_b32 s1, v2, 0 +; GCN_DBG-NEXT: s_mov_b32 s2, 32 +; GCN_DBG-NEXT: s_add_i32 s2, s0, s2 +; GCN_DBG-NEXT: s_mov_b32 s3, 2 +; GCN_DBG-NEXT: s_lshl_b32 s2, s2, s3 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 ; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 @@ -407,6 +414,7 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN_DBG-NEXT: s_add_u32 s12, s12, s11 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 ; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN_DBG-NEXT: s_load_dword s1, s[4:5], 0xa ; GCN_DBG-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 @@ -440,11 +448,11 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 3 ; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 1 ; GCN_DBG-NEXT: v_readlane_b32 s3, v2, 2 -; GCN_DBG-NEXT: v_readlane_b32 s4, v2, 0 -; GCN_DBG-NEXT: s_mov_b32 s1, 2 -; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 -; GCN_DBG-NEXT: s_add_i32 s1, s1, s4 -; GCN_DBG-NEXT: s_mov_b32 s4, 0x80 +; GCN_DBG-NEXT: v_readlane_b32 s1, v2, 0 +; GCN_DBG-NEXT: s_mov_b32 s4, 32 +; GCN_DBG-NEXT: s_add_i32 s4, s0, s4 +; GCN_DBG-NEXT: s_mov_b32 s5, 2 +; GCN_DBG-NEXT: s_lshl_b32 s4, s4, s5 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s4 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 ; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index 91625eb1af8f..2f287d269e4d 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -49,6 +49,8 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_add_u32 s12, s12, s11 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-O0-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: v_writelane_b32 v4, s0, 0 @@ -218,6 +220,8 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: s_add_u32 s12, s12, s11 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-O0-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: v_writelane_b32 v4, s0, 0 @@ -422,6 +426,8 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[0:1] ; GCN-O0-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane ; GCN-O0-NEXT: v_writelane_b32 v4, s2, 0 @@ -434,12 +440,12 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_mov_b32 s5, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: s_mov_b32 s4, 2 -; GCN-O0-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v0 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 +; GCN-O0-NEXT: s_mov_b32 s4, 2 +; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[1:2], s4 ; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 ; GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[0:3], 0 addr64 ; GCN-O0-NEXT: s_mov_b32 s0, 1 @@ -660,36 +666,39 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN-O0-NEXT: s_add_u32 s12, s12, s11 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 ; GCN-O0-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b32 s0, 2 -; GCN-O0-NEXT: v_lshlrev_b32_e64 v2, s0, v0 +; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v0 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 +; GCN-O0-NEXT: s_mov_b32 s2, 2 +; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[1:2], s2 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s2, s4 +; GCN-O0-NEXT: s_mov_b32 s2, s0 ; GCN-O0-NEXT: v_mov_b32_e32 v1, v2 -; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s4, s1 ; GCN-O0-NEXT: v_mov_b32_e32 v5, v3 ; GCN-O0-NEXT: v_add_i32_e64 v4, s[2:3], s2, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 ; GCN-O0-NEXT: v_addc_u32_e64 v1, s[2:3], v1, v5, s[2:3] ; GCN-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v5, v1 ; GCN-O0-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b32 s1, 0xf000 -; GCN-O0-NEXT: s_mov_b32 s2, 0 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] +; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] ; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 -; GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[4:7], 0 addr64 -; GCN-O0-NEXT: v_cmp_lt_u32_e64 s[0:1], v0, s0 +; GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[0:3], 0 addr64 +; GCN-O0-NEXT: s_mov_b32 s0, 1 +; GCN-O0-NEXT: v_cmp_le_u32_e64 s[0:1], v0, s0 ; GCN-O0-NEXT: s_mov_b64 s[2:3], exec ; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] ; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3] @@ -905,6 +914,8 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: s_add_u32 s12, s12, s11 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: v_writelane_b32 v3, s0, 0 @@ -1059,15 +1070,15 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: v_writelane_b32 v7, s7, 1 ; GCN-O0-NEXT: v_writelane_b32 v7, s4, 2 ; GCN-O0-NEXT: v_writelane_b32 v7, s5, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: .LBB5_1: ; %bb1 ; GCN-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s8, v7, 2 ; GCN-O0-NEXT: v_readlane_b32 s9, v7, 3 @@ -1087,26 +1098,27 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-O0-NEXT: v_writelane_b32 v7, s6, 2 ; GCN-O0-NEXT: v_writelane_b32 v7, s7, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1 ; GCN-O0-NEXT: ; %bb.2: ; %bb2 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s4, v7, 6 ; GCN-O0-NEXT: v_readlane_b32 s5, v7, 7 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b32 s6, 0 +; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s6 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s4 +; GCN-O0-NEXT: s_mov_b64 s[4:5], -1 +; GCN-O0-NEXT: s_xor_b64 s[4:5], s[6:7], s[4:5] ; GCN-O0-NEXT: v_writelane_b32 v7, s4, 8 ; GCN-O0-NEXT: v_writelane_b32 v7, s5, 9 ; GCN-O0-NEXT: s_mov_b32 s4, 0 @@ -1125,18 +1137,18 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: s_mov_b64 s[4:5], exec ; GCN-O0-NEXT: v_writelane_b32 v7, s4, 10 ; GCN-O0-NEXT: v_writelane_b32 v7, s5, 11 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: s_cbranch_execz .LBB5_5 ; GCN-O0-NEXT: ; %bb.3: ; %bb4 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: ; implicit-def: $sgpr4 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 ; GCN-O0-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen @@ -1158,25 +1170,17 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: s_mov_b64 s[4:5], exec ; GCN-O0-NEXT: v_writelane_b32 v7, s4, 12 ; GCN-O0-NEXT: v_writelane_b32 v7, s5, 13 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: s_cbranch_execz .LBB5_6 ; GCN-O0-NEXT: ; %bb.4: ; %bb8 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_mov_b32 s10, 0 -; GCN-O0-NEXT: ; implicit-def: $sgpr4 -; GCN-O0-NEXT: ; implicit-def: $sgpr5 -; GCN-O0-NEXT: ; implicit-def: $sgpr9 -; GCN-O0-NEXT: ; implicit-def: $sgpr5 -; GCN-O0-NEXT: ; implicit-def: $sgpr8 -; GCN-O0-NEXT: ; implicit-def: $sgpr5 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 -; GCN-O0-NEXT: s_mov_b32 s5, s10 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s8, 0 +; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s5, s8 ; GCN-O0-NEXT: s_waitcnt expcnt(1) ; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 @@ -1189,10 +1193,10 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: s_branch .LBB5_6 ; GCN-O0-NEXT: .LBB5_5: ; %Flow2 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s4, v7, 10 ; GCN-O0-NEXT: v_readlane_b32 s5, v7, 11 @@ -1212,10 +1216,10 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: s_branch .LBB5_7 ; GCN-O0-NEXT: .LBB5_6: ; %Flow ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s4, v7, 12 ; GCN-O0-NEXT: v_readlane_b32 s5, v7, 13 @@ -1235,9 +1239,9 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: s_branch .LBB5_5 ; GCN-O0-NEXT: .LBB5_7: ; %bb10 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s6, v7, 8 ; GCN-O0-NEXT: v_readlane_b32 s7, v7, 9 @@ -1247,32 +1251,32 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: s_mov_b64 s[4:5], exec ; GCN-O0-NEXT: v_writelane_b32 v7, s4, 16 ; GCN-O0-NEXT: v_writelane_b32 v7, s5, 17 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: s_cbranch_execz .LBB5_9 ; GCN-O0-NEXT: ; %bb.8: ; %Flow1 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_mov_b64 s[4:5], 0 ; GCN-O0-NEXT: s_xor_b64 s[4:5], exec, -1 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_writelane_b32 v7, s4, 14 ; GCN-O0-NEXT: v_writelane_b32 v7, s5, 15 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: .LBB5_9: ; %Flow3 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s8, v7, 16 ; GCN-O0-NEXT: v_readlane_b32 s9, v7, 17 @@ -1296,9 +1300,9 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-O0-NEXT: v_writelane_b32 v7, s6, 18 ; GCN-O0-NEXT: v_writelane_b32 v7, s7, 19 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_waitcnt vmcnt(4) ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(4) @@ -1310,10 +1314,10 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1 ; GCN-O0-NEXT: ; %bb.10: ; %bb12 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(4) ; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s4, v7, 18 ; GCN-O0-NEXT: v_readlane_b32 s5, v7, 19 diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll index 255dff3d5723..26c97a9a3b1a 100644 --- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -110,12 +110,12 @@ endif: ; GCN: [[LOOP:.LBB[0-9]+_[0-9]+]]: ; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload ; GCN: v_sub_i32_e32 v[[VAL_LOOP_RELOAD]], vcc, v[[VAL_LOOP_RELOAD]], v{{[0-9]+}} -; GCN: s_cmp_lg_u32 +; GCN: s_cmp_{{lg|eq}}_u32 ; VMEM: buffer_store_dword ; VMEM: buffer_store_dword ; VMEM: buffer_store_dword ; GCN: buffer_store_dword v[[VAL_LOOP_RELOAD]], off, s[0:3], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill -; GCN-NEXT: s_cbranch_scc1 [[LOOP]] +; GCN-NEXT: s_cbranch_vccnz [[LOOP]] ; GCN: buffer_store_dword v[[VAL_LOOP_RELOAD]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll index 81fda9864791..126916beddb5 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll @@ -8,16 +8,18 @@ define i32 @divergent_lshr_and_cmp(i32 %x) { ; GCN-NEXT: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 2, [[COPY]], implicit $exec - ; GCN-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 killed [[V_AND_B32_e64_]], 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2 + ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], killed [[S_MOV_B32_]], implicit $exec + ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 killed [[V_AND_B32_e64_]], killed [[S_MOV_B32_1]], implicit $exec ; GCN-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_NE_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1.out.true: ; GCN-NEXT: successors: %bb.2(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2 - ; GCN-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_]], [[COPY]], implicit $exec + ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2 + ; GCN-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_2]], [[COPY]], implicit $exec ; GCN-NEXT: S_BRANCH %bb.2 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2.UnifiedReturnBlock: @@ -45,40 +47,52 @@ define amdgpu_kernel void @uniform_opt_lshr_and_cmp(ptr addrspace(1) %out, i32 % ; GCN-NEXT: liveins: $sgpr4_sgpr5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4) - ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from constant-pool + 36, align 4, addrspace 4) + ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s32) from constant-pool + 44, addrspace 4) + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4) + ; GCN-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM1]] ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2 - ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_]], implicit-def dead $scc - ; GCN-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 2, [[S_LOAD_DWORD_IMM]], implicit-def dead $scc - ; GCN-NEXT: S_CMP_LG_U32 killed [[S_AND_B32_1]], 0, implicit-def $scc + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[S_LOAD_DWORD_IMM1]], killed [[S_MOV_B32_]], implicit-def dead $scc + ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: S_CMP_LG_U32 killed [[S_AND_B32_]], killed [[S_MOV_B32_1]], implicit-def $scc ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $scc ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]] - ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], killed [[S_MOV_B32_1]], implicit-def $scc - ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[COPY2]], killed [[S_MOV_B64_]], implicit-def dead $scc + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, killed [[S_XOR_B64_]], implicit-def dead $scc + ; GCN-NEXT: $vcc = COPY [[S_AND_B64_]] + ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1.out.true: - ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 - ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[COPY3]], killed [[S_MOV_B64_]], implicit-def dead $scc + ; GCN-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; GCN-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[COPY3]], killed [[S_MOV_B64_1]], implicit-def dead $scc ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY1]].sub1 ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY1]].sub0 ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1, killed [[S_MOV_B32_3]], %subreg.sub2, killed [[S_MOV_B32_2]], %subreg.sub3 - ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_XOR_B64_]], implicit $exec - ; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1) + ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[S_XOR_B64_1]], implicit $exec + ; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[V_CNDMASK_B32_e64_]] + ; GCN-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[COPY6]], killed [[S_MOV_B32_4]], implicit-def dead $scc + ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_1]] + ; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[COPY7]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2.out.else: - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY1]].sub1 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY1]].sub0 - ; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[S_MOV_B32_5]], %subreg.sub2, killed [[S_MOV_B32_4]], %subreg.sub3 - ; GCN-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[COPY3]], implicit $exec - ; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_1]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1) + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY1]].sub1 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY1]].sub0 + ; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GCN-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[S_MOV_B32_6]], %subreg.sub2, killed [[S_MOV_B32_5]], %subreg.sub3 + ; GCN-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[COPY3]], implicit $exec + ; GCN-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GCN-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[V_CNDMASK_B32_e64_1]] + ; GCN-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[COPY10]], killed [[S_MOV_B32_7]], implicit-def dead $scc + ; GCN-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_2]] + ; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[COPY11]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 entry: %0 = and i32 %x, 2 diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 8193ee4104ab..6a5b3bc42555 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -217,238 +217,261 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0: ; %bb.0: ; %_udiv-special-cases ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-O0-NEXT: v_ashrrev_i64 v[10:11], s4, v[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v1 +; GFX9-O0-NEXT: v_ashrrev_i64 v[14:15], s4, v[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v11 +; GFX9-O0-NEXT: v_xor_b32_e64 v0, v5, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_xor_b32_e64 v10, v3, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9 +; GFX9-O0-NEXT: v_xor_b32_e64 v0, v5, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-O0-NEXT: v_xor_b32_e64 v12, v3, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v3 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v5, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v8, v3, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v5, vcc +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v15 +; GFX9-O0-NEXT: v_xor_b32_e64 v0, v4, v0 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_xor_b32_e64 v14, v2, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-O0-NEXT: v_xor_b32_e64 v0, v4, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-O0-NEXT: v_xor_b32_e64 v6, v2, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v4, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v14, vcc, v6, v2, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v4, vcc +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 +; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: v_xor_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v8, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v5, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_writelane_b32 v30, s8, 0 +; GFX9-O0-NEXT: v_writelane_b32 v30, s9, 1 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[0:1], s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v7, v3, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v2, v0 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[6:7], v[9:10], s[8:9] +; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v4 +; GFX9-O0-NEXT: s_mov_b32 s6, 32 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s6 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-O0-NEXT: v_min_u32_e64 v6, v4, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s6 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-O0-NEXT: v_min_u32_e64 v16, v5, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v4 +; GFX9-O0-NEXT: s_mov_b64 s[10:11], 64 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-O0-NEXT: s_mov_b32 s12, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 +; GFX9-O0-NEXT: s_mov_b32 s7, s11 +; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[12:13], v8, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v9, s[12:13] ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v11 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 0 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 1 -; GFX9-O0-NEXT: s_mov_b32 s10, s6 -; GFX9-O0-NEXT: v_writelane_b32 v30, s10, 2 -; GFX9-O0-NEXT: s_mov_b32 s11, s7 -; GFX9-O0-NEXT: v_writelane_b32 v30, s11, 3 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, s10, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v0, v2, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v0, v14, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v19, vcc -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], s[4:5] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v3 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v19, v0, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v14, v0, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v21 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v17, vcc, s10, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v4, v8, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v4, v13, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v9, vcc -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v18 -; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[20:21], s[4:5] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, v8, v10, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v10 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v9, v4, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v13, v4, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-O0-NEXT: v_xor_b32_e64 v9, v9, v19 -; GFX9-O0-NEXT: v_xor_b32_e64 v13, v13, v14 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9 -; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[13:14] -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v18 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 -; GFX9-O0-NEXT: v_or_b32_e64 v9, v9, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 -; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v14 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[13:14], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v16 -; GFX9-O0-NEXT: v_or_b32_e64 v9, v9, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v14 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[13:14], s[6:7] -; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[11:12], s[6:7] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v4 -; GFX9-O0-NEXT: s_mov_b32 s12, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s12 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 -; GFX9-O0-NEXT: v_min_u32_e64 v8, v4, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[14:15], s[12:13] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[14:15] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v6, v7, s[12:13] ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 -; GFX9-O0-NEXT: v_add_u32_e64 v7, v7, s12 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v10 -; GFX9-O0-NEXT: v_min_u32_e64 v13, v7, v10 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v4 -; GFX9-O0-NEXT: s_mov_b64 s[14:15], 64 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13 -; GFX9-O0-NEXT: s_mov_b32 s16, s14 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 -; GFX9-O0-NEXT: s_mov_b32 s13, s15 -; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[16:17], v10, s16 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s13 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[16:17], v7, v11, s[16:17] -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[8:9] -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[5:6], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v0 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s12 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v1 ; GFX9-O0-NEXT: v_min_u32_e64 v5, v5, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v2 -; GFX9-O0-NEXT: v_add_u32_e64 v10, v10, s12 +; GFX9-O0-NEXT: v_add_u32_e64 v10, v10, s6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v3 -; GFX9-O0-NEXT: v_min_u32_e64 v11, v10, v11 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v11 -; GFX9-O0-NEXT: s_mov_b32 s12, s14 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 -; GFX9-O0-NEXT: s_mov_b32 s14, s15 -; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[12:13], v10, s12 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s14 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[12:13], v4, v11, s[12:13] +; GFX9-O0-NEXT: v_min_u32_e64 v14, v10, v11 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14 +; GFX9-O0-NEXT: s_mov_b32 s6, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v15 +; GFX9-O0-NEXT: s_mov_b32 s10, s11 +; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[6:7], v10, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s10 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[6:7], v4, v11, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[12:13], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[10:11] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: s_mov_b32 s7, s8 +; GFX9-O0-NEXT: s_mov_b32 s14, s9 ; GFX9-O0-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v7 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v5, v6, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v6, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s14 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v6, vcc ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], v[7:8], s[10:11] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[16:17], s[12:13] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[16:17], v[4:5], s[16:17] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[16:17] +; GFX9-O0-NEXT: s_mov_b64 s[16:17], s[8:9] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[16:17], v[7:8], s[16:17] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[16:17] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[10:11] ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1 -; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] -; GFX9-O0-NEXT: s_mov_b64 s[14:15], -1 -; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[8:9] -; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15] +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v6, 1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] +; GFX9-O0-NEXT: s_mov_b32 s6, 1 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] +; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, 1 +; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: s_mov_b32 s14, s13 -; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 +; GFX9-O0-NEXT: s_mov_b32 s15, s13 +; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s15 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -461,23 +484,29 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[4:5], s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s14 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13] ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s14 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[10:11] ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] +; GFX9-O0-NEXT: v_and_b32_e64 v4, 1, v4 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, 1 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], -1 +; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill @@ -485,8 +514,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 4 -; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 5 +; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 2 +; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] @@ -499,64 +528,64 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6 -; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 +; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: ; %bb.2: ; %Flow -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_5 ; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4 -; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5 +; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 2 +; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 3 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_9 ; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] @@ -593,30 +622,30 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 8 -; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 9 +; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6 +; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_4 ; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -624,32 +653,32 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 10 -; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 11 -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 8 +; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 9 +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 63 ; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3] @@ -780,57 +809,57 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v2 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v0 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v15 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v14 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v13 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7 +; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 5 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 10 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 11 +; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8 +; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6 ; GFX9-O0-NEXT: s_branch .LBB0_1 @@ -838,14 +867,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload @@ -875,13 +904,17 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], v5, v[14:15] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v21 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v22, s[6:7] -; GFX9-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v22, s[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v20 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: s_mov_b32 s4, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, s4 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v19 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v20, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v18 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec @@ -921,41 +954,41 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4 ; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s4 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 10 -; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 11 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 8 +; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_6 ; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 @@ -965,71 +998,82 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: s_mov_b32 s4, s7 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: s_mov_b32 s8, s6 ; GFX9-O0-NEXT: s_mov_b32 s9, s7 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v3, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v2, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v0 +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f -; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[10:11], 0x7f +; GFX9-O0-NEXT: s_mov_b32 s5, s10 +; GFX9-O0-NEXT: s_mov_b32 s4, s11 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v2, vcc, s5, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v3, v4, vcc +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 -; GFX9-O0-NEXT: s_mov_b32 s4, 64 -; GFX9-O0-NEXT: v_sub_u32_e64 v13, s4, v2 -; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 -; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: s_mov_b32 s10, 64 +; GFX9-O0-NEXT: v_sub_u32_e64 v12, s10, v2 +; GFX9-O0-NEXT: v_lshrrev_b64 v[12:13], v12, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v13 +; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v14 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 ; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 -; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v2, s4 -; GFX9-O0-NEXT: s_mov_b32 s10, 63 -; GFX9-O0-NEXT: v_sub_u32_e64 v3, s10, v3 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v2, s10 +; GFX9-O0-NEXT: v_sub_u32_e64 v3, v2, s10 ; GFX9-O0-NEXT: v_lshlrev_b64 v[12:13], v3, v[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[4:5] -; GFX9-O0-NEXT: s_mov_b32 s10, 0 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v2, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[10:11] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v2, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v12, s[10:11] +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -1043,12 +1087,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v3, v6, s[4:5] ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 @@ -1062,23 +1106,23 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9 +; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6 +; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] @@ -1086,12 +1130,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-O0-NEXT: s_branch .LBB0_7 ; GFX9-O0-NEXT: .LBB0_9: ; %udiv-end -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) @@ -1131,7 +1175,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] ; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -2449,11 +2493,11 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_or_b32_e64 v0, v5, v4 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 0 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 1 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[0:1], s[6:7] +; GFX9-O0-NEXT: v_writelane_b32 v30, s8, 0 +; GFX9-O0-NEXT: v_writelane_b32 v30, s9, 1 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[0:1], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 ; GFX9-O0-NEXT: v_or_b32_e64 v7, v3, v1 @@ -2462,11 +2506,11 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_or_b32_e64 v9, v2, v0 ; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], s[6:7] -; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[6:7], v[9:10], s[8:9] +; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v4 -; GFX9-O0-NEXT: s_mov_b32 s8, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s8 +; GFX9-O0-NEXT: s_mov_b32 s6, 32 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 ; GFX9-O0-NEXT: v_min_u32_e64 v6, v4, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 @@ -2474,7 +2518,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s8 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 ; GFX9-O0-NEXT: v_min_u32_e64 v16, v5, v8 ; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec @@ -2483,14 +2527,14 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v16 ; GFX9-O0-NEXT: s_mov_b32 s12, s10 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 -; GFX9-O0-NEXT: s_mov_b32 s9, s11 +; GFX9-O0-NEXT: s_mov_b32 s7, s11 ; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[12:13], v8, s12 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v9, s[12:13] ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[14:15], s[12:13] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 @@ -2499,75 +2543,87 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v0 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s8 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v1 ; GFX9-O0-NEXT: v_min_u32_e64 v5, v5, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v2 -; GFX9-O0-NEXT: v_add_u32_e64 v10, v10, s8 +; GFX9-O0-NEXT: v_add_u32_e64 v10, v10, s6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v3 ; GFX9-O0-NEXT: v_min_u32_e64 v14, v10, v11 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14 -; GFX9-O0-NEXT: s_mov_b32 s8, s10 +; GFX9-O0-NEXT: s_mov_b32 s6, s10 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v15 ; GFX9-O0-NEXT: s_mov_b32 s10, s11 -; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[8:9], v10, s8 +; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[6:7], v10, s6 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, s10 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[8:9], v4, v11, s[8:9] +; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[6:7], v4, v11, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[12:13], s[8:9] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[12:13], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: s_mov_b32 s10, s6 -; GFX9-O0-NEXT: s_mov_b32 s11, s7 +; GFX9-O0-NEXT: s_mov_b32 s7, s8 +; GFX9-O0-NEXT: s_mov_b32 s14, s9 ; GFX9-O0-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v7 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v5, v6, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v6, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s14 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v6, vcc ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], v[7:8], s[10:11] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[16:17], s[12:13] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[16:17], v[4:5], s[16:17] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[16:17] +; GFX9-O0-NEXT: s_mov_b64 s[16:17], s[8:9] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[16:17], v[7:8], s[16:17] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[16:17] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[10:11] ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1 -; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] -; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1 -; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v6, 1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] +; GFX9-O0-NEXT: s_mov_b32 s6, 1 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] +; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, 1 +; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: s_mov_b32 s14, s13 -; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 +; GFX9-O0-NEXT: s_mov_b32 s15, s13 +; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s15 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -2580,20 +2636,29 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[4:5], s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s14 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13] ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s14 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[10:11] ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] +; GFX9-O0-NEXT: v_and_b32_e64 v4, 1, v4 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, 1 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], -1 +; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill @@ -2603,17 +2668,17 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec ; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 2 ; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 3 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_3 ; GFX9-O0-NEXT: s_branch .LBB1_8 ; GFX9-O0-NEXT: .LBB1_1: ; %Flow -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5 @@ -2645,9 +2710,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_5 ; GFX9-O0-NEXT: .LBB1_3: ; %Flow2 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 2 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 3 @@ -2705,9 +2770,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_3 ; GFX9-O0-NEXT: .LBB1_5: ; %Flow1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 @@ -2736,9 +2801,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB1_4 ; GFX9-O0-NEXT: .LBB1_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 8 ; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 9 @@ -2920,9 +2985,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8 ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill @@ -2951,9 +3016,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_cbranch_execnz .LBB1_6 ; GFX9-O0-NEXT: s_branch .LBB1_1 ; GFX9-O0-NEXT: .LBB1_7: ; %udiv-preheader -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload @@ -2991,13 +3056,17 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], v5, v[14:15] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v21 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v22, s[6:7] -; GFX9-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v22, s[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v20 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: s_mov_b32 s4, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, s4 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v19 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v20, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v18 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec @@ -3045,9 +3114,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 8 ; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill @@ -3074,78 +3143,89 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_6 ; GFX9-O0-NEXT: .LBB1_8: ; %udiv-bb1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: s_mov_b32 s4, s7 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: s_mov_b32 s8, s6 ; GFX9-O0-NEXT: s_mov_b32 s9, s7 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v3, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v2, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v0 +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f -; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[10:11], 0x7f +; GFX9-O0-NEXT: s_mov_b32 s5, s10 +; GFX9-O0-NEXT: s_mov_b32 s4, s11 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v2, vcc, s5, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v3, v4, vcc +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 -; GFX9-O0-NEXT: s_mov_b32 s4, 64 -; GFX9-O0-NEXT: v_sub_u32_e64 v13, s4, v2 -; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 -; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: s_mov_b32 s10, 64 +; GFX9-O0-NEXT: v_sub_u32_e64 v12, s10, v2 +; GFX9-O0-NEXT: v_lshrrev_b64 v[12:13], v12, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v13 +; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v14 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 ; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 -; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v2, s4 -; GFX9-O0-NEXT: s_mov_b32 s10, 63 -; GFX9-O0-NEXT: v_sub_u32_e64 v3, s10, v3 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v2, s10 +; GFX9-O0-NEXT: v_sub_u32_e64 v3, v2, s10 ; GFX9-O0-NEXT: v_lshlrev_b64 v[12:13], v3, v[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[4:5] -; GFX9-O0-NEXT: s_mov_b32 s10, 0 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v2, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[10:11] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v2, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v12, s[10:11] +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -3195,9 +3275,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6 ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_5 ; GFX9-O0-NEXT: s_branch .LBB1_7 @@ -4264,53 +4344,56 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) { ; GFX9-O0-LABEL: v_sdiv_i128_v_pow2k: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-O0-NEXT: v_ashrrev_i64 v[6:7], s4, v[6:7] -; GFX9-O0-NEXT: s_mov_b32 s5, 31 -; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s5, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_ashrrev_i64 v[2:3], s4, v[2:3] +; GFX9-O0-NEXT: s_mov_b32 s4, 31 +; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-O0-NEXT: s_mov_b32 s6, s8 -; GFX9-O0-NEXT: s_mov_b32 s4, s9 -; GFX9-O0-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v5, vcc, v2, v4, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v1, v2, vcc -; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: s_mov_b32 s5, s9 +; GFX9-O0-NEXT: v_add_co_u32_e32 v5, vcc, v4, v5 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v2, v3, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v2, vcc ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[1:2], s4, v[0:1] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 ; GFX9-O0-NEXT: s_mov_b32 s4, 33 -; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] -; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_lshl_or_b32 v0, v2, s5, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-O0-NEXT: v_lshrrev_b64 v[5:6], s4, v[5:6] +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: v_ashrrev_i64 v[3:4], s4, v[3:4] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-O0-NEXT: s_mov_b32 s4, 1 -; GFX9-O0-NEXT: v_alignbit_b32 v1, v1, v2, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-O0-NEXT: s_mov_b32 s4, 32 +; GFX9-O0-NEXT: v_lshrrev_b64 v[1:2], s4, v[1:2] +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] ; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -4403,26 +4486,35 @@ define i128 @v_sdiv_exact_i128_v_pow2k(i128 %lhs) { ; GFX9-O0-LABEL: v_sdiv_exact_i128_v_pow2k: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-O0-NEXT: s_mov_b32 s4, 31 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[1:2], s4, v[0:1] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 ; GFX9-O0-NEXT: s_mov_b32 s4, 33 -; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] -; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: s_mov_b32 s5, 31 -; GFX9-O0-NEXT: v_lshl_or_b32 v0, v2, s5, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-O0-NEXT: v_lshrrev_b64 v[5:6], s4, v[5:6] +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: v_ashrrev_i64 v[3:4], s4, v[3:4] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-O0-NEXT: s_mov_b32 s4, 1 -; GFX9-O0-NEXT: v_alignbit_b32 v1, v1, v2, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-O0-NEXT: s_mov_b32 s4, 32 +; GFX9-O0-NEXT: v_lshrrev_b64 v[1:2], s4, v[1:2] +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] ; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -4481,26 +4573,37 @@ define i128 @v_udiv_i128_v_pow2k(i128 %lhs) { ; GFX9-O0-LABEL: v_udiv_i128_v_pow2k: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-O0-NEXT: s_mov_b32 s4, 31 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[1:2], s4, v[0:1] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 ; GFX9-O0-NEXT: s_mov_b32 s4, 33 -; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] -; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: s_mov_b32 s5, 31 -; GFX9-O0-NEXT: v_lshl_or_b32 v0, v4, s5, v0 +; GFX9-O0-NEXT: v_lshrrev_b64 v[5:6], s4, v[5:6] +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[1:2] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-O0-NEXT: s_mov_b32 s4, 1 -; GFX9-O0-NEXT: v_alignbit_b32 v1, v1, v4, s4 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O0-NEXT: s_mov_b32 s4, 32 +; GFX9-O0-NEXT: v_lshrrev_b64 v[1:2], s4, v[1:2] +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-G-LABEL: v_udiv_i128_v_pow2k: @@ -4554,26 +4657,37 @@ define i128 @v_udiv_exact_i128_v_pow2k(i128 %lhs) { ; GFX9-O0-LABEL: v_udiv_exact_i128_v_pow2k: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-O0-NEXT: s_mov_b32 s4, 31 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[1:2], s4, v[0:1] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 ; GFX9-O0-NEXT: s_mov_b32 s4, 33 -; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] -; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: s_mov_b32 s5, 31 -; GFX9-O0-NEXT: v_lshl_or_b32 v0, v4, s5, v0 +; GFX9-O0-NEXT: v_lshrrev_b64 v[5:6], s4, v[5:6] +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[1:2] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-O0-NEXT: s_mov_b32 s4, 1 -; GFX9-O0-NEXT: v_alignbit_b32 v1, v1, v4, s4 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O0-NEXT: s_mov_b32 s4, 32 +; GFX9-O0-NEXT: v_lshrrev_b64 v[1:2], s4, v[1:2] +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-G-LABEL: v_udiv_exact_i128_v_pow2k: diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll index 975695b03c11..bf84e32de657 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll @@ -25,6 +25,10 @@ define amdgpu_kernel void @float4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-LABEL: float4_extelt: ; GCN-O0: ; %bb.0: ; %entry ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-O0-NEXT: s_mov_b32 s3, 4.0 ; GCN-O0-NEXT: s_mov_b32 s4, 2.0 @@ -72,6 +76,10 @@ define amdgpu_kernel void @int4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-LABEL: int4_extelt: ; GCN-O0: ; %bb.0: ; %entry ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-O0-NEXT: s_mov_b32 s3, 4 ; GCN-O0-NEXT: s_mov_b32 s4, 2 @@ -125,6 +133,10 @@ define amdgpu_kernel void @double4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-LABEL: double4_extelt: ; GCN-O0: ; %bb.0: ; %entry ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-O0-NEXT: s_mov_b32 s3, 0x40100a3d ; GCN-O0-NEXT: s_mov_b32 s4, 0x70a3d70a @@ -158,9 +170,9 @@ define amdgpu_kernel void @double4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-NEXT: s_mov_b32 s9, s13 ; GCN-O0-NEXT: s_mov_b32 s10, s12 ; GCN-O0-NEXT: s_mov_b32 s11, s3 -; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_mov_b32 s3, 2 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 +; GCN-O0-NEXT: s_mul_i32 s2, s2, s3 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 ; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 @@ -170,17 +182,19 @@ define amdgpu_kernel void @double4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 ; GCN-O0-NEXT: v_mov_b32_e32 v7, s11 ; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s11 +; GCN-O0-NEXT: v_movrels_b32_e32 v2, v0 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_add_i32 s2, s2, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s11 ; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v2, v1 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v3 ; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 @@ -228,6 +242,10 @@ define amdgpu_kernel void @double5_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-LABEL: double5_extelt: ; GCN-O0: ; %bb.0: ; %entry ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-O0-NEXT: s_mov_b32 s3, 0x40140a3d ; GCN-O0-NEXT: s_mov_b32 s4, 0x70a3d70a @@ -286,9 +304,9 @@ define amdgpu_kernel void @double5_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-NEXT: s_mov_b32 s17, s21 ; GCN-O0-NEXT: s_mov_b32 s18, s20 ; GCN-O0-NEXT: s_mov_b32 s19, s3 -; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_mov_b32 s3, 2 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 +; GCN-O0-NEXT: s_mul_i32 s2, s2, s3 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 ; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 @@ -306,25 +324,27 @@ define amdgpu_kernel void @double5_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-NEXT: v_mov_b32_e32 v14, s18 ; GCN-O0-NEXT: v_mov_b32_e32 v15, s19 ; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s12 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s13 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s14 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s15 -; GCN-O0-NEXT: v_mov_b32_e32 v13, s16 -; GCN-O0-NEXT: v_mov_b32_e32 v14, s17 -; GCN-O0-NEXT: v_mov_b32_e32 v15, s18 -; GCN-O0-NEXT: v_mov_b32_e32 v16, s19 +; GCN-O0-NEXT: v_movrels_b32_e32 v2, v0 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_add_i32 s2, s2, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s13 +; GCN-O0-NEXT: v_mov_b32_e32 v13, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v14, s15 +; GCN-O0-NEXT: v_mov_b32_e32 v15, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v16, s17 +; GCN-O0-NEXT: v_mov_b32_e32 v17, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v18, s19 ; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v2, v1 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v3 ; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 @@ -355,14 +375,17 @@ define amdgpu_kernel void @half4_extelt(ptr addrspace(1) %out, i32 %sel) { ; ; GCN-O0-LABEL: half4_extelt: ; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c ; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GCN-O0-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN-O0-NEXT: s_mov_b32 s5, 0x44004200 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_mov_b32 s0, 0x40003c00 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; GCN-O0-NEXT: s_mov_b32 s1, s5 ; GCN-O0-NEXT: s_mov_b32 s5, 4 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_lshl_b32 s4, s4, s5 ; GCN-O0-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -394,6 +417,10 @@ define amdgpu_kernel void @float2_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-LABEL: float2_extelt: ; GCN-O0: ; %bb.0: ; %entry ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-O0-NEXT: s_mov_b32 s3, 1.0 ; GCN-O0-NEXT: s_mov_b32 s4, 0 @@ -435,6 +462,10 @@ define amdgpu_kernel void @double2_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-LABEL: double2_extelt: ; GCN-O0: ; %bb.0: ; %entry ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-O0-NEXT: s_mov_b32 s3, 0x3ff028f5 ; GCN-O0-NEXT: s_mov_b32 s4, 0xc28f5c29 @@ -452,21 +483,23 @@ define amdgpu_kernel void @double2_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-NEXT: s_mov_b32 s5, s9 ; GCN-O0-NEXT: s_mov_b32 s6, s8 ; GCN-O0-NEXT: s_mov_b32 s7, s3 -; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_mov_b32 s3, 2 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 +; GCN-O0-NEXT: s_mul_i32 s2, s2, s3 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 ; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 ; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 ; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s7 +; GCN-O0-NEXT: v_movrels_b32_e32 v2, v0 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_add_i32 s2, s2, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s7 ; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v2, v1 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v3 ; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 @@ -522,6 +555,10 @@ define amdgpu_kernel void @half8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-LABEL: half8_extelt: ; GCN-O0: ; %bb.0: ; %entry ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-O0-NEXT: s_mov_b32 s3, 1 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) @@ -606,7 +643,11 @@ define amdgpu_kernel void @short8_extelt(ptr addrspace(1) %out, i32 %sel) { ; ; GCN-O0-LABEL: short8_extelt: ; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c ; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c ; GCN-O0-NEXT: s_mov_b32 s1, 1 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) @@ -666,6 +707,10 @@ define amdgpu_kernel void @float8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-LABEL: float8_extelt: ; GCN-O0: ; %bb.0: ; %entry ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-O0-NEXT: s_mov_b32 s3, 0x41000000 ; GCN-O0-NEXT: s_mov_b32 s4, 0x40e00000 @@ -753,6 +798,10 @@ define amdgpu_kernel void @double8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-LABEL: double8_extelt: ; GCN-O0: ; %bb.0: ; %entry ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: s_mov_b32 s5, 0x40200000 @@ -799,9 +848,9 @@ define amdgpu_kernel void @double8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-NEXT: s_mov_b32 s17, s21 ; GCN-O0-NEXT: s_mov_b32 s18, s20 ; GCN-O0-NEXT: s_mov_b32 s19, s3 -; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_mov_b32 s3, 2 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 +; GCN-O0-NEXT: s_mul_i32 s2, s2, s3 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 ; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 @@ -819,25 +868,27 @@ define amdgpu_kernel void @double8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-NEXT: v_mov_b32_e32 v14, s18 ; GCN-O0-NEXT: v_mov_b32_e32 v15, s19 ; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s12 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s13 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s14 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s15 -; GCN-O0-NEXT: v_mov_b32_e32 v13, s16 -; GCN-O0-NEXT: v_mov_b32_e32 v14, s17 -; GCN-O0-NEXT: v_mov_b32_e32 v15, s18 -; GCN-O0-NEXT: v_mov_b32_e32 v16, s19 +; GCN-O0-NEXT: v_movrels_b32_e32 v2, v0 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_add_i32 s2, s2, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s13 +; GCN-O0-NEXT: v_mov_b32_e32 v13, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v14, s15 +; GCN-O0-NEXT: v_mov_b32_e32 v15, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v16, s17 +; GCN-O0-NEXT: v_mov_b32_e32 v17, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v18, s19 ; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v2, v1 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v3 ; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 @@ -897,6 +948,10 @@ define amdgpu_kernel void @double7_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-LABEL: double7_extelt: ; GCN-O0: ; %bb.0: ; %entry ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: s_mov_b32 s5, 0x401c0000 @@ -943,9 +998,9 @@ define amdgpu_kernel void @double7_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-NEXT: s_mov_b32 s17, s21 ; GCN-O0-NEXT: s_mov_b32 s18, s20 ; GCN-O0-NEXT: s_mov_b32 s19, s3 -; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_mov_b32 s3, 2 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 +; GCN-O0-NEXT: s_mul_i32 s2, s2, s3 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 ; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 @@ -963,25 +1018,27 @@ define amdgpu_kernel void @double7_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-NEXT: v_mov_b32_e32 v14, s18 ; GCN-O0-NEXT: v_mov_b32_e32 v15, s19 ; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s12 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s13 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s14 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s15 -; GCN-O0-NEXT: v_mov_b32_e32 v13, s16 -; GCN-O0-NEXT: v_mov_b32_e32 v14, s17 -; GCN-O0-NEXT: v_mov_b32_e32 v15, s18 -; GCN-O0-NEXT: v_mov_b32_e32 v16, s19 +; GCN-O0-NEXT: v_movrels_b32_e32 v2, v0 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_add_i32 s2, s2, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s13 +; GCN-O0-NEXT: v_mov_b32_e32 v13, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v14, s15 +; GCN-O0-NEXT: v_mov_b32_e32 v15, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v16, s17 +; GCN-O0-NEXT: v_mov_b32_e32 v17, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v18, s19 ; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v2, v1 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v3 ; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 @@ -1026,6 +1083,10 @@ define amdgpu_kernel void @float16_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-LABEL: float16_extelt: ; GCN-O0: ; %bb.0: ; %entry ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-O0-NEXT: s_mov_b32 s3, 0x41800000 ; GCN-O0-NEXT: s_mov_b32 s4, 0x41700000 @@ -1167,6 +1228,10 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-LABEL: double15_extelt: ; GCN-O0: ; %bb.0: ; %entry ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-O0-NEXT: s_mov_b32 s6, 0 ; GCN-O0-NEXT: s_mov_b32 s7, 0x402e0000 @@ -1261,9 +1326,9 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-NEXT: s_mov_b32 s65, s5 ; GCN-O0-NEXT: s_mov_b32 s66, s4 ; GCN-O0-NEXT: s_mov_b32 s67, s3 -; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_mov_b32 s3, 2 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 +; GCN-O0-NEXT: s_mul_i32 s2, s2, s3 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s36 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s37 ; GCN-O0-NEXT: v_mov_b32_e32 v2, s38 @@ -1297,41 +1362,43 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-NEXT: v_mov_b32_e32 v30, s66 ; GCN-O0-NEXT: v_mov_b32_e32 v31, s67 ; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s36 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s37 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s38 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s39 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s40 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s41 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s42 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s43 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s44 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s45 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s46 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s47 -; GCN-O0-NEXT: v_mov_b32_e32 v13, s48 -; GCN-O0-NEXT: v_mov_b32_e32 v14, s49 -; GCN-O0-NEXT: v_mov_b32_e32 v15, s50 -; GCN-O0-NEXT: v_mov_b32_e32 v16, s51 -; GCN-O0-NEXT: v_mov_b32_e32 v17, s52 -; GCN-O0-NEXT: v_mov_b32_e32 v18, s53 -; GCN-O0-NEXT: v_mov_b32_e32 v19, s54 -; GCN-O0-NEXT: v_mov_b32_e32 v20, s55 -; GCN-O0-NEXT: v_mov_b32_e32 v21, s56 -; GCN-O0-NEXT: v_mov_b32_e32 v22, s57 -; GCN-O0-NEXT: v_mov_b32_e32 v23, s58 -; GCN-O0-NEXT: v_mov_b32_e32 v24, s59 -; GCN-O0-NEXT: v_mov_b32_e32 v25, s60 -; GCN-O0-NEXT: v_mov_b32_e32 v26, s61 -; GCN-O0-NEXT: v_mov_b32_e32 v27, s62 -; GCN-O0-NEXT: v_mov_b32_e32 v28, s63 -; GCN-O0-NEXT: v_mov_b32_e32 v29, s64 -; GCN-O0-NEXT: v_mov_b32_e32 v30, s65 -; GCN-O0-NEXT: v_mov_b32_e32 v31, s66 -; GCN-O0-NEXT: v_mov_b32_e32 v32, s67 +; GCN-O0-NEXT: v_movrels_b32_e32 v2, v0 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_add_i32 s2, s2, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s36 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s37 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s38 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s39 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s40 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s41 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s42 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s43 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s44 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s45 +; GCN-O0-NEXT: v_mov_b32_e32 v13, s46 +; GCN-O0-NEXT: v_mov_b32_e32 v14, s47 +; GCN-O0-NEXT: v_mov_b32_e32 v15, s48 +; GCN-O0-NEXT: v_mov_b32_e32 v16, s49 +; GCN-O0-NEXT: v_mov_b32_e32 v17, s50 +; GCN-O0-NEXT: v_mov_b32_e32 v18, s51 +; GCN-O0-NEXT: v_mov_b32_e32 v19, s52 +; GCN-O0-NEXT: v_mov_b32_e32 v20, s53 +; GCN-O0-NEXT: v_mov_b32_e32 v21, s54 +; GCN-O0-NEXT: v_mov_b32_e32 v22, s55 +; GCN-O0-NEXT: v_mov_b32_e32 v23, s56 +; GCN-O0-NEXT: v_mov_b32_e32 v24, s57 +; GCN-O0-NEXT: v_mov_b32_e32 v25, s58 +; GCN-O0-NEXT: v_mov_b32_e32 v26, s59 +; GCN-O0-NEXT: v_mov_b32_e32 v27, s60 +; GCN-O0-NEXT: v_mov_b32_e32 v28, s61 +; GCN-O0-NEXT: v_mov_b32_e32 v29, s62 +; GCN-O0-NEXT: v_mov_b32_e32 v30, s63 +; GCN-O0-NEXT: v_mov_b32_e32 v31, s64 +; GCN-O0-NEXT: v_mov_b32_e32 v32, s65 +; GCN-O0-NEXT: v_mov_b32_e32 v33, s66 +; GCN-O0-NEXT: v_mov_b32_e32 v34, s67 ; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v2, v1 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v3 ; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 @@ -1425,6 +1492,10 @@ define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-LABEL: double16_extelt: ; GCN-O0: ; %bb.0: ; %entry ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: s_mov_b32 s5, 0x40300000 @@ -1519,9 +1590,9 @@ define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-NEXT: s_mov_b32 s65, s5 ; GCN-O0-NEXT: s_mov_b32 s66, s4 ; GCN-O0-NEXT: s_mov_b32 s67, s3 -; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_mov_b32 s3, 2 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 +; GCN-O0-NEXT: s_mul_i32 s2, s2, s3 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s36 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s37 ; GCN-O0-NEXT: v_mov_b32_e32 v2, s38 @@ -1555,41 +1626,43 @@ define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-NEXT: v_mov_b32_e32 v30, s66 ; GCN-O0-NEXT: v_mov_b32_e32 v31, s67 ; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s36 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s37 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s38 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s39 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s40 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s41 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s42 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s43 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s44 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s45 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s46 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s47 -; GCN-O0-NEXT: v_mov_b32_e32 v13, s48 -; GCN-O0-NEXT: v_mov_b32_e32 v14, s49 -; GCN-O0-NEXT: v_mov_b32_e32 v15, s50 -; GCN-O0-NEXT: v_mov_b32_e32 v16, s51 -; GCN-O0-NEXT: v_mov_b32_e32 v17, s52 -; GCN-O0-NEXT: v_mov_b32_e32 v18, s53 -; GCN-O0-NEXT: v_mov_b32_e32 v19, s54 -; GCN-O0-NEXT: v_mov_b32_e32 v20, s55 -; GCN-O0-NEXT: v_mov_b32_e32 v21, s56 -; GCN-O0-NEXT: v_mov_b32_e32 v22, s57 -; GCN-O0-NEXT: v_mov_b32_e32 v23, s58 -; GCN-O0-NEXT: v_mov_b32_e32 v24, s59 -; GCN-O0-NEXT: v_mov_b32_e32 v25, s60 -; GCN-O0-NEXT: v_mov_b32_e32 v26, s61 -; GCN-O0-NEXT: v_mov_b32_e32 v27, s62 -; GCN-O0-NEXT: v_mov_b32_e32 v28, s63 -; GCN-O0-NEXT: v_mov_b32_e32 v29, s64 -; GCN-O0-NEXT: v_mov_b32_e32 v30, s65 -; GCN-O0-NEXT: v_mov_b32_e32 v31, s66 -; GCN-O0-NEXT: v_mov_b32_e32 v32, s67 +; GCN-O0-NEXT: v_movrels_b32_e32 v2, v0 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_add_i32 s2, s2, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s36 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s37 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s38 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s39 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s40 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s41 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s42 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s43 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s44 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s45 +; GCN-O0-NEXT: v_mov_b32_e32 v13, s46 +; GCN-O0-NEXT: v_mov_b32_e32 v14, s47 +; GCN-O0-NEXT: v_mov_b32_e32 v15, s48 +; GCN-O0-NEXT: v_mov_b32_e32 v16, s49 +; GCN-O0-NEXT: v_mov_b32_e32 v17, s50 +; GCN-O0-NEXT: v_mov_b32_e32 v18, s51 +; GCN-O0-NEXT: v_mov_b32_e32 v19, s52 +; GCN-O0-NEXT: v_mov_b32_e32 v20, s53 +; GCN-O0-NEXT: v_mov_b32_e32 v21, s54 +; GCN-O0-NEXT: v_mov_b32_e32 v22, s55 +; GCN-O0-NEXT: v_mov_b32_e32 v23, s56 +; GCN-O0-NEXT: v_mov_b32_e32 v24, s57 +; GCN-O0-NEXT: v_mov_b32_e32 v25, s58 +; GCN-O0-NEXT: v_mov_b32_e32 v26, s59 +; GCN-O0-NEXT: v_mov_b32_e32 v27, s60 +; GCN-O0-NEXT: v_mov_b32_e32 v28, s61 +; GCN-O0-NEXT: v_mov_b32_e32 v29, s62 +; GCN-O0-NEXT: v_mov_b32_e32 v30, s63 +; GCN-O0-NEXT: v_mov_b32_e32 v31, s64 +; GCN-O0-NEXT: v_mov_b32_e32 v32, s65 +; GCN-O0-NEXT: v_mov_b32_e32 v33, s66 +; GCN-O0-NEXT: v_mov_b32_e32 v34, s67 ; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v2, v1 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v3 ; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 @@ -1650,6 +1723,10 @@ define amdgpu_kernel void @float32_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-LABEL: float32_extelt: ; GCN-O0: ; %bb.0: ; %entry ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-O0-NEXT: s_mov_b32 s3, 0x42000000 ; GCN-O0-NEXT: s_mov_b32 s4, 0x41f80000 @@ -1778,15 +1855,43 @@ define amdgpu_kernel void @byte8_extelt(ptr addrspace(1) %out, i32 %sel) { ; ; GCN-O0-LABEL: byte8_extelt: ; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c -; GCN-O0-NEXT: s_mov_b32 s1, 3 +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s4, s0, s1 -; GCN-O0-NEXT: s_mov_b32 s5, 0x8070605 -; GCN-O0-NEXT: s_mov_b32 s0, 0x4030201 +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s4, s[4:5], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s0, 0x200 +; GCN-O0-NEXT: s_mov_b32 s1, 1 +; GCN-O0-NEXT: s_or_b32 s0, s0, s1 +; GCN-O0-NEXT: s_mov_b32 s6, 0xffff +; GCN-O0-NEXT: s_and_b32 s0, s6, s0 +; GCN-O0-NEXT: s_mov_b32 s1, 0x400 +; GCN-O0-NEXT: s_mov_b32 s5, 3 +; GCN-O0-NEXT: s_or_b32 s1, s1, s5 +; GCN-O0-NEXT: s_mov_b32 s8, 16 +; GCN-O0-NEXT: s_lshl_b32 s1, s1, s8 +; GCN-O0-NEXT: s_or_b32 s0, s0, s1 +; GCN-O0-NEXT: s_mov_b32 s7, 0 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s1, s7 +; GCN-O0-NEXT: s_mov_b32 s7, 0x600 +; GCN-O0-NEXT: s_mov_b32 s9, 5 +; GCN-O0-NEXT: s_or_b32 s7, s7, s9 +; GCN-O0-NEXT: s_and_b32 s6, s6, s7 +; GCN-O0-NEXT: s_mov_b32 s7, 0x800 +; GCN-O0-NEXT: s_mov_b32 s9, 7 +; GCN-O0-NEXT: s_or_b32 s7, s7, s9 +; GCN-O0-NEXT: s_lshl_b32 s7, s7, s8 +; GCN-O0-NEXT: s_or_b32 s6, s6, s7 +; GCN-O0-NEXT: ; implicit-def: $sgpr8 +; GCN-O0-NEXT: ; implicit-def: $sgpr7 +; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s8, 32 +; GCN-O0-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 +; GCN-O0-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GCN-O0-NEXT: s_lshl_b32 s4, s4, s5 ; GCN-O0-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 @@ -1851,44 +1956,71 @@ define amdgpu_kernel void @byte16_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-O0-NEXT: s_add_u32 s12, s12, s11 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-O0-NEXT: s_mov_b32 s3, 15 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_and_b32 s3, s2, s3 -; GCN-O0-NEXT: s_mov_b32 s2, 0 -; GCN-O0-NEXT: s_or_b32 s2, s2, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 16 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:15 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 15 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:14 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 14 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:13 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 13 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:12 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 12 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:11 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 11 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 10 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:9 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 9 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:8 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 8 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:7 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 7 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 6 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:5 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 5 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:4 +; GCN-O0-NEXT: s_and_b32 s2, s2, s3 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_mul_i32 s2, s2, s3 +; GCN-O0-NEXT: s_mov_b32 s3, 0 +; GCN-O0-NEXT: s_add_i32 s2, s3, s2 +; GCN-O0-NEXT: s_mov_b32 s4, 2 +; GCN-O0-NEXT: s_add_i32 s5, s3, s4 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 4 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 3 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[12:15], 0 offen offset:1 +; GCN-O0-NEXT: s_mov_b32 s5, 4 +; GCN-O0-NEXT: s_add_i32 s6, s3, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 6 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s6 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[12:15], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 7 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s6 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[12:15], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s6, s6, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 8 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s6 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[12:15], 0 offen offset:1 +; GCN-O0-NEXT: s_mov_b32 s6, 8 +; GCN-O0-NEXT: s_add_i32 s3, s3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 10 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[12:15], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 11 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[12:15], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s6, s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 12 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s6 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[12:15], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 13 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[12:15], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s3, s3, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 14 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[12:15], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 15 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[12:15], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s3, s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 16 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[12:15], 0 offen offset:1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 ; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 ; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 3 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 5 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 9 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:8 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 ; GCN-O0-NEXT: buffer_load_ubyte v2, v0, s[12:15], 0 offen ; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 @@ -1919,18 +2051,28 @@ define amdgpu_kernel void @bit4_extelt(ptr addrspace(1) %out, i32 %sel) { ; ; GCN-O0-LABEL: bit4_extelt: ; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c -; GCN-O0-NEXT: s_mov_b32 s1, 3 +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s1, s0, s1 -; GCN-O0-NEXT: s_mov_b32 s0, 0x1000100 -; GCN-O0-NEXT: s_lshr_b32 s0, s0, s1 -; GCN-O0-NEXT: s_mov_b32 s1, 1 -; GCN-O0-NEXT: s_and_b32 s0, s0, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s3, s[4:5], 0x2c +; GCN-O0-NEXT: s_mov_b32 s2, 0xffff +; GCN-O0-NEXT: s_mov_b32 s4, 0x100 +; GCN-O0-NEXT: s_and_b32 s2, s2, s4 +; GCN-O0-NEXT: s_mov_b32 s5, 16 +; GCN-O0-NEXT: s_lshl_b32 s4, s4, s5 +; GCN-O0-NEXT: s_or_b32 s2, s2, s4 +; GCN-O0-NEXT: s_mov_b32 s4, 3 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_lshl_b32 s3, s3, s4 +; GCN-O0-NEXT: s_lshr_b32 s2, s2, s3 +; GCN-O0-NEXT: s_and_b32 s2, 1, s2 +; GCN-O0-NEXT: s_cmp_eq_u32 s2, 1 +; GCN-O0-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 ; GCN-O0-NEXT: flat_store_dword v[0:1], v2 ; GCN-O0-NEXT: s_endpgm entry: @@ -2211,154 +2353,351 @@ define amdgpu_kernel void @bit128_extelt(ptr addrspace(1) %out, i32 %sel) { ; ; GCN-O0-LABEL: bit128_extelt: ; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN-O0-NEXT: s_mov_b32 s14, -1 -; GCN-O0-NEXT: s_mov_b32 s15, 0xe80000 -; GCN-O0-NEXT: s_add_u32 s12, s12, s11 -; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 +; GCN-O0-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN-O0-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN-O0-NEXT: s_mov_b32 s18, -1 +; GCN-O0-NEXT: s_mov_b32 s19, 0xe80000 +; GCN-O0-NEXT: s_add_u32 s16, s16, s11 +; GCN-O0-NEXT: s_addc_u32 s17, s17, 0 +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-O0-NEXT: s_mov_b32 s3, 0x7f ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_and_b32 s3, s2, s3 -; GCN-O0-NEXT: s_mov_b32 s2, 0 -; GCN-O0-NEXT: s_add_i32 s2, s2, s3 +; GCN-O0-NEXT: s_and_b32 s2, s2, s3 +; GCN-O0-NEXT: s_mov_b32 s4, 1 +; GCN-O0-NEXT: s_mul_i32 s2, s2, s4 +; GCN-O0-NEXT: s_mov_b32 s3, 0 +; GCN-O0-NEXT: s_add_i32 s2, s3, s2 +; GCN-O0-NEXT: s_mov_b32 s5, 2 +; GCN-O0-NEXT: s_add_i32 s6, s3, s5 ; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:127 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s6 +; GCN-O0-NEXT: buffer_store_byte v1, v0, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: s_mov_b32 s6, 4 +; GCN-O0-NEXT: s_add_i32 s7, s3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s7 +; GCN-O0-NEXT: buffer_store_byte v1, v0, s[16:19], 0 offen offset:1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:126 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:125 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:124 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:123 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:122 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:121 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:120 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:119 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:118 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:117 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:116 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:115 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:114 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:113 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:112 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:111 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:110 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:109 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:108 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:107 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:106 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:105 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:104 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:103 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:102 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:101 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:100 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:99 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:98 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:97 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:96 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:95 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:94 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:93 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:92 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:91 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:90 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:89 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:88 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:87 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:86 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:85 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:84 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:83 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:82 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:81 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:80 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:79 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:78 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:77 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:76 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:75 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:74 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:73 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:72 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:71 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:70 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:69 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:68 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:67 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:66 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:65 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:64 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:63 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:62 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:61 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:60 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:59 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:58 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:57 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:56 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:55 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:54 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:53 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:52 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:51 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:50 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:49 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:48 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:47 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:46 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:45 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:44 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:43 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:42 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:41 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:40 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:39 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:38 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:37 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:36 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:35 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:34 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:33 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:32 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:31 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:30 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:29 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:28 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:27 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:26 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:25 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:24 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:23 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:22 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:21 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:20 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:19 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:18 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:17 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:16 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:15 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:14 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:13 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:12 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:11 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:10 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:9 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:8 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:7 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:6 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:5 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:4 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:3 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:2 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:1 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s7 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s7, s7, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s7 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: s_mov_b32 s7, 8 +; GCN-O0-NEXT: s_add_i32 s8, s3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s8 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s8 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s9, s8, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s8 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s8, s8, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s8 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s8 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s8, s8, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s8 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: s_mov_b32 s8, 16 +; GCN-O0-NEXT: s_add_i32 s9, s3, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s10, s9, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s10, s9, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s10, s10, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:8 +; GCN-O0-NEXT: s_add_i32 s9, s9, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s10, s9, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s9, s9, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s9, s9, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: s_mov_b32 s9, 32 +; GCN-O0-NEXT: s_add_i32 s10, s3, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s11, s10, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s11 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s11, s10, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s11 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s11 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s11, s11, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s11 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:8 +; GCN-O0-NEXT: s_add_i32 s11, s10, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s11 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s11 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s12, s11, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s12 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s11 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s11, s11, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s11 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s11 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s11, s11, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s11 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:16 +; GCN-O0-NEXT: s_add_i32 s10, s10, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s11, s10, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s11 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s11, s10, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s11 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s11 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s11, s11, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s11 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:8 +; GCN-O0-NEXT: s_add_i32 s10, s10, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s11, s10, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s11 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s10, s10, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s10, s10, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: s_mov_b32 s10, 64 +; GCN-O0-NEXT: s_add_i32 s3, s3, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s10, s3, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s10, s3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s10, s10, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:8 +; GCN-O0-NEXT: s_add_i32 s10, s3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s11, s10, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s11 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s10, s10, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s10, s10, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:16 +; GCN-O0-NEXT: s_add_i32 s10, s3, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s11, s10, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s11 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s11, s10, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s11 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s11 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s11, s11, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s11 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:8 +; GCN-O0-NEXT: s_add_i32 s10, s10, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s11, s10, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s11 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s10, s10, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s10, s10, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:32 +; GCN-O0-NEXT: s_add_i32 s3, s3, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s9, s3, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s9, s3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s9, s9, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:8 +; GCN-O0-NEXT: s_add_i32 s9, s3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s10, s9, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s9, s9, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s9, s9, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:16 +; GCN-O0-NEXT: s_add_i32 s3, s3, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s8, s3, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s8 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s8, s3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s8 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s8 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s8, s8, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s8 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:8 +; GCN-O0-NEXT: s_add_i32 s3, s3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s7, s3, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s7 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s3, s3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 +; GCN-O0-NEXT: buffer_store_byte v0, v2, s[16:19], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s3, s3, s5 +; GCN-O0-NEXT: s_add_i32 s3, s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 +; GCN-O0-NEXT: buffer_store_byte v1, v2, s[16:19], 0 offen +; GCN-O0-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:2 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:4 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:8 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:16 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:32 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:64 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: buffer_load_ubyte v0, v0, s[12:15], 0 offen -; GCN-O0-NEXT: s_mov_b32 s2, 1 +; GCN-O0-NEXT: buffer_load_ubyte v0, v0, s[16:19], 0 offen ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v2, v0, s2 +; GCN-O0-NEXT: v_and_b32_e64 v0, 1, v0 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, 1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] ; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 ; GCN-O0-NEXT: flat_store_dword v[0:1], v2 @@ -2793,8 +3132,8 @@ define double @double16_extelt_vec(i32 %sel) { ; GCN-O0: ; %bb.0: ; %entry ; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: v_writelane_b32 v34, s36, 0 ; GCN-O0-NEXT: v_writelane_b32 v34, s37, 1 @@ -2962,8 +3301,11 @@ define double @double16_extelt_vec(i32 %sel) { ; GCN-O0-NEXT: v_writelane_b32 v35, s65, 30 ; GCN-O0-NEXT: v_writelane_b32 v35, s66, 31 ; GCN-O0-NEXT: v_writelane_b32 v35, s67, 32 +; GCN-O0-NEXT: s_mov_b32 s4, 2 +; GCN-O0-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s4, 1 -; GCN-O0-NEXT: v_lshlrev_b32_e64 v0, s4, v0 +; GCN-O0-NEXT: v_add_u32_e64 v0, s[4:5], v0, s4 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GCN-O0-NEXT: v_mov_b32_e32 v0, s36 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s37 @@ -3044,7 +3386,7 @@ define double @double16_extelt_vec(i32 %sel) { ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s4, v35, 35 ; GCN-O0-NEXT: v_readlane_b32 s5, v35, 36 -; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -3083,9 +3425,9 @@ define double @double16_extelt_vec(i32 %sel) { ; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v32 ; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GCN-O0-NEXT: s_mov_b32 m0, s6 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v1 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-O0-NEXT: v_writelane_b32 v35, s6, 35 ; GCN-O0-NEXT: v_writelane_b32 v35, s7, 36 @@ -3171,38 +3513,38 @@ define double @double16_extelt_vec(i32 %sel) { ; GCN-O0-NEXT: v_mov_b32_e32 v29, s65 ; GCN-O0-NEXT: v_mov_b32_e32 v30, s66 ; GCN-O0-NEXT: v_mov_b32_e32 v31, s67 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 s[4:5], exec ; GCN-O0-NEXT: v_writelane_b32 v35, s4, 37 ; GCN-O0-NEXT: v_writelane_b32 v35, s5, 38 @@ -3218,48 +3560,48 @@ define double @double16_extelt_vec(i32 %sel) { ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s4, v35, 39 ; GCN-O0-NEXT: v_readlane_b32 s5, v35, 40 -; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readfirstlane_b32 s6, v32 ; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v32 ; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GCN-O0-NEXT: s_mov_b32 m0, s6 ; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-O0-NEXT: v_writelane_b32 v35, s6, 39 ; GCN-O0-NEXT: v_writelane_b32 v35, s7, 40 @@ -3277,12 +3619,13 @@ define double @double16_extelt_vec(i32 %sel) { ; GCN-O0-NEXT: v_readlane_b32 s5, v35, 38 ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: ; %bb.6: -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GCN-O0-NEXT: s_waitcnt vmcnt(1) -; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, v1 ; GCN-O0-NEXT: s_mov_b32 s4, 32 ; GCN-O0-NEXT: v_lshrrev_b64 v[1:2], s4, v[1:2] ; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec @@ -3303,8 +3646,8 @@ define double @double16_extelt_vec(i32 %sel) { ; GCN-O0-NEXT: v_readlane_b32 s37, v34, 1 ; GCN-O0-NEXT: v_readlane_b32 s36, v34, 0 ; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-alloca-issue-155902.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-alloca-issue-155902.ll index 122b75acf400..2219c5a7133f 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-alloca-issue-155902.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-alloca-issue-155902.ll @@ -9,6 +9,106 @@ define amdgpu_kernel void @issue155902(i64 %arg, i64 %arg1, i64 %arg2, i64 %arg3 ; GFX950-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GFX950-NEXT: v_writelane_b32 v2, s33, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x188 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x180 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x178 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x170 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x168 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x160 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x158 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x150 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x148 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x140 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x138 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x130 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x128 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x120 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x118 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x110 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x108 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x100 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf8 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xe8 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xe0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd8 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xc8 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb8 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xa8 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xa0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x98 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x90 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x88 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x80 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x78 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x70 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x68 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x60 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x58 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x50 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x48 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x40 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x18 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x20 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x28 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x38 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX950-NEXT: s_load_dwordx2 vcc, s[2:3], 0x8 ; GFX950-NEXT: s_load_dwordx2 s[98:99], s[2:3], 0x10 @@ -237,6 +337,106 @@ define amdgpu_kernel void @issue155902_fp(i64 %arg, i64 %arg1, i64 %arg2, i64 %a ; GFX950-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GFX950-NEXT: v_writelane_b32 v2, s0, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x188 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x180 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x178 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x170 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x168 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x160 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x158 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x150 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x148 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x140 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x138 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x130 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x128 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x120 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x118 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x110 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x108 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x100 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xf8 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xf0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xe8 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xe0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd8 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xc8 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb8 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xa8 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xa0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x98 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x90 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x88 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x80 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x78 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x70 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x68 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x60 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x58 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x50 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x48 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x40 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x10 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x18 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x28 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x30 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x38 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: v_writelane_b32 v2, s4, 1 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index 8fcf1ad3fbc9..a9e67ff3fdcb 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -81,6 +81,10 @@ define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) { ; NOOPT-LABEL: extract_w_offset: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dword s0, s[4:5], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; NOOPT-NEXT: s_load_dword s4, s[4:5], 0xb ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s7, s1 @@ -332,6 +336,11 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dword s0, s[2:3], 0xb +; NOOPT-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb ; NOOPT-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x19 ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -691,6 +700,10 @@ define amdgpu_kernel void @extract_wo_offset(ptr addrspace(1) %out, i32 %in) { ; NOOPT-LABEL: extract_wo_offset: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dword s0, s[4:5], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; NOOPT-NEXT: s_load_dword s4, s[4:5], 0xb ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s7, s1 @@ -918,6 +931,10 @@ define amdgpu_kernel void @extract_neg_offset_sgpr(ptr addrspace(1) %out, i32 %o ; NOOPT-LABEL: extract_neg_offset_sgpr: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dword s0, s[4:5], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; NOOPT-NEXT: s_load_dword s4, s[4:5], 0xb ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s7, s1 @@ -1166,6 +1183,12 @@ define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, ; NOOPT-LABEL: extract_neg_offset_sgpr_loaded: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x29 +; NOOPT-NEXT: s_load_dword s0, s[4:5], 0x39 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; NOOPT-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x19 ; NOOPT-NEXT: s_load_dwordx16 s[52:67], s[4:5], 0x29 ; NOOPT-NEXT: s_load_dword s4, s[4:5], 0x39 @@ -1509,6 +1532,8 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) { ; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:68 ; 4-byte Folded Spill ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s6, s1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 ; NOOPT-NEXT: s_mov_b32 s4, 0xf000 @@ -1802,18 +1827,32 @@ define amdgpu_kernel void @extract_undef_offset_sgpr(ptr addrspace(1) %out, ptr ; ; NOOPT-LABEL: extract_undef_offset_sgpr: ; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s6, s1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 -; NOOPT-NEXT: s_mov_b32 s4, 0xf000 -; NOOPT-NEXT: s_mov_b32 s5, -1 +; NOOPT-NEXT: s_mov_b32 s8, 0xf000 +; NOOPT-NEXT: s_mov_b32 s9, -1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 ; NOOPT-NEXT: s_mov_b32 s1, s6 -; NOOPT-NEXT: s_mov_b32 s2, s5 -; NOOPT-NEXT: s_mov_b32 s3, s4 -; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; NOOPT-NEXT: s_mov_b32 s2, s9 +; NOOPT-NEXT: s_mov_b32 s3, s8 +; NOOPT-NEXT: s_mov_b32 s10, s5 +; NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: s_mov_b32 s5, s10 +; NOOPT-NEXT: s_mov_b32 s6, s9 +; NOOPT-NEXT: s_mov_b32 s7, s8 +; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc ; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: v_mov_b32_e32 v0, s4 +; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; NOOPT-NEXT: s_endpgm ; ; SI-MOVREL-LABEL: extract_undef_offset_sgpr: @@ -1861,6 +1900,29 @@ define amdgpu_kernel void @insert_undef_offset_sgpr_vector_src(ptr addrspace(1) ; ; NOOPT-LABEL: insert_undef_offset_sgpr_vector_src: ; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s8, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; NOOPT-NEXT: s_mov_b32 s7, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s8 +; NOOPT-NEXT: s_mov_b32 s2, s7 +; NOOPT-NEXT: s_mov_b32 s3, s6 +; NOOPT-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, s4 +; NOOPT-NEXT: v_mov_b32_e32 v1, s5 +; NOOPT-NEXT: v_mov_b32_e32 v2, s6 +; NOOPT-NEXT: v_mov_b32_e32 v3, s7 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; NOOPT-NEXT: s_endpgm ; ; SI-MOVREL-LABEL: insert_undef_offset_sgpr_vector_src: @@ -1962,51 +2024,55 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { ; ; NOOPT-LABEL: insert_w_offset: ; NOOPT: ; %bb.0: ; %entry -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; NOOPT-NEXT: s_load_dword s4, s[4:5], 0xb +; NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5] +; NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) -; NOOPT-NEXT: s_mov_b32 s7, s1 -; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 -; NOOPT-NEXT: s_mov_b32 s5, 0xf000 -; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: s_load_dword s2, s[0:1], 0xb +; NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; NOOPT-NEXT: s_load_dword s7, s[0:1], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s9, s5 +; NOOPT-NEXT: s_mov_b32 s0, s4 +; NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; NOOPT-NEXT: s_mov_b32 s8, -1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 -; NOOPT-NEXT: s_mov_b32 s1, s7 -; NOOPT-NEXT: s_mov_b32 s2, s6 -; NOOPT-NEXT: s_mov_b32 s3, s5 -; NOOPT-NEXT: s_mov_b32 s5, 1 -; NOOPT-NEXT: s_add_i32 s4, s4, s5 -; NOOPT-NEXT: s_mov_b32 s5, 0x41800000 -; NOOPT-NEXT: s_mov_b32 s6, 0x41700000 -; NOOPT-NEXT: s_mov_b32 s7, 0x41600000 -; NOOPT-NEXT: s_mov_b32 s8, 0x41500000 -; NOOPT-NEXT: s_mov_b32 s9, 0x41400000 -; NOOPT-NEXT: s_mov_b32 s10, 0x41300000 -; NOOPT-NEXT: s_mov_b32 s11, 0x41200000 -; NOOPT-NEXT: s_mov_b32 s12, 0x41100000 -; NOOPT-NEXT: s_mov_b32 s13, 0x41000000 -; NOOPT-NEXT: s_mov_b32 s14, 0x40e00000 -; NOOPT-NEXT: s_mov_b32 s15, 0x40c00000 -; NOOPT-NEXT: s_mov_b32 s16, 0x40a00000 -; NOOPT-NEXT: s_mov_b32 s17, 4.0 -; NOOPT-NEXT: s_mov_b32 s18, 0x40400000 -; NOOPT-NEXT: s_mov_b32 s19, 2.0 -; NOOPT-NEXT: s_mov_b32 s20, 1.0 -; NOOPT-NEXT: v_mov_b32_e32 v7, s20 -; NOOPT-NEXT: v_mov_b32_e32 v30, s19 -; NOOPT-NEXT: v_mov_b32_e32 v29, s18 -; NOOPT-NEXT: v_mov_b32_e32 v28, s17 -; NOOPT-NEXT: v_mov_b32_e32 v27, s16 -; NOOPT-NEXT: v_mov_b32_e32 v26, s15 -; NOOPT-NEXT: v_mov_b32_e32 v25, s14 -; NOOPT-NEXT: v_mov_b32_e32 v24, s13 -; NOOPT-NEXT: v_mov_b32_e32 v23, s12 -; NOOPT-NEXT: v_mov_b32_e32 v6, s11 -; NOOPT-NEXT: v_mov_b32_e32 v5, s10 -; NOOPT-NEXT: v_mov_b32_e32 v4, s9 -; NOOPT-NEXT: v_mov_b32_e32 v3, s8 -; NOOPT-NEXT: v_mov_b32_e32 v2, s7 -; NOOPT-NEXT: v_mov_b32_e32 v1, s6 -; NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; NOOPT-NEXT: s_mov_b32 s1, s9 +; NOOPT-NEXT: s_mov_b32 s2, s8 +; NOOPT-NEXT: s_mov_b32 s3, s6 +; NOOPT-NEXT: s_mov_b32 s8, 1 +; NOOPT-NEXT: s_add_i32 s7, s7, s8 +; NOOPT-NEXT: s_mov_b32 s8, 0x41800000 +; NOOPT-NEXT: s_mov_b32 s9, 0x41700000 +; NOOPT-NEXT: s_mov_b32 s10, 0x41600000 +; NOOPT-NEXT: s_mov_b32 s11, 0x41500000 +; NOOPT-NEXT: s_mov_b32 s12, 0x41400000 +; NOOPT-NEXT: s_mov_b32 s13, 0x41300000 +; NOOPT-NEXT: s_mov_b32 s14, 0x41200000 +; NOOPT-NEXT: s_mov_b32 s15, 0x41100000 +; NOOPT-NEXT: s_mov_b32 s16, 0x41000000 +; NOOPT-NEXT: s_mov_b32 s17, 0x40e00000 +; NOOPT-NEXT: s_mov_b32 s18, 0x40c00000 +; NOOPT-NEXT: s_mov_b32 s19, 0x40a00000 +; NOOPT-NEXT: s_mov_b32 s20, 4.0 +; NOOPT-NEXT: s_mov_b32 s21, 0x40400000 +; NOOPT-NEXT: s_mov_b32 s22, 2.0 +; NOOPT-NEXT: s_mov_b32 s23, 1.0 +; NOOPT-NEXT: v_mov_b32_e32 v7, s23 +; NOOPT-NEXT: v_mov_b32_e32 v30, s22 +; NOOPT-NEXT: v_mov_b32_e32 v29, s21 +; NOOPT-NEXT: v_mov_b32_e32 v28, s20 +; NOOPT-NEXT: v_mov_b32_e32 v27, s19 +; NOOPT-NEXT: v_mov_b32_e32 v26, s18 +; NOOPT-NEXT: v_mov_b32_e32 v25, s17 +; NOOPT-NEXT: v_mov_b32_e32 v24, s16 +; NOOPT-NEXT: v_mov_b32_e32 v23, s15 +; NOOPT-NEXT: v_mov_b32_e32 v6, s14 +; NOOPT-NEXT: v_mov_b32_e32 v5, s13 +; NOOPT-NEXT: v_mov_b32_e32 v4, s12 +; NOOPT-NEXT: v_mov_b32_e32 v3, s11 +; NOOPT-NEXT: v_mov_b32_e32 v2, s10 +; NOOPT-NEXT: v_mov_b32_e32 v1, s9 +; NOOPT-NEXT: v_mov_b32_e32 v0, s8 ; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22 killed $exec ; NOOPT-NEXT: v_mov_b32_e32 v8, v30 ; NOOPT-NEXT: v_mov_b32_e32 v9, v29 @@ -2024,7 +2090,7 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { ; NOOPT-NEXT: v_mov_b32_e32 v21, v1 ; NOOPT-NEXT: v_mov_b32_e32 v22, v0 ; NOOPT-NEXT: v_mov_b32_e32 v0, 0x41880000 -; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: s_mov_b32 m0, s7 ; NOOPT-NEXT: v_movreld_b32_e32 v7, v0 ; NOOPT-NEXT: v_mov_b32_e32 v4, v22 ; NOOPT-NEXT: v_mov_b32_e32 v5, v21 @@ -2034,7 +2100,14 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { ; NOOPT-NEXT: v_mov_b32_e32 v1, v6 ; NOOPT-NEXT: v_mov_b32_e32 v2, v5 ; NOOPT-NEXT: v_mov_b32_e32 v3, v4 -; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: s_mov_b32 s8, 0 +; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; NOOPT-NEXT: s_mov_b32 s9, s6 +; NOOPT-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: s_mov_b64 s[6:7], s[8:9] +; NOOPT-NEXT: v_mov_b32_e32 v4, 32 +; NOOPT-NEXT: v_mov_b32_e32 v5, 0 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:16 ; NOOPT-NEXT: v_mov_b32_e32 v4, v18 ; NOOPT-NEXT: v_mov_b32_e32 v5, v17 ; NOOPT-NEXT: v_mov_b32_e32 v6, v16 @@ -2316,51 +2389,62 @@ define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou ; ; NOOPT-LABEL: insert_unsigned_base_plus_offset: ; NOOPT: ; %bb.0: ; %entry -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; NOOPT-NEXT: s_load_dword s4, s[4:5], 0xb +; NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5] +; NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) -; NOOPT-NEXT: s_mov_b32 s7, s1 -; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 -; NOOPT-NEXT: s_mov_b32 s5, 0xf000 -; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: s_mov_b32 s2, s1 +; NOOPT-NEXT: s_mov_b32 s12, s0 +; NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; NOOPT-NEXT: s_mov_b32 s8, -1 +; NOOPT-NEXT: ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13_sgpr14_sgpr15 +; NOOPT-NEXT: s_mov_b32 s13, s2 +; NOOPT-NEXT: s_mov_b32 s14, s8 +; NOOPT-NEXT: s_mov_b32 s15, s6 +; NOOPT-NEXT: buffer_load_ushort v0, off, s[12:15], 0 offset:44 +; NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; NOOPT-NEXT: s_load_dword s7, s[0:1], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s9, s5 +; NOOPT-NEXT: s_mov_b32 s0, s4 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 -; NOOPT-NEXT: s_mov_b32 s1, s7 -; NOOPT-NEXT: s_mov_b32 s2, s6 -; NOOPT-NEXT: s_mov_b32 s3, s5 -; NOOPT-NEXT: s_mov_b32 s5, 0xffff -; NOOPT-NEXT: s_and_b32 s4, s4, s5 -; NOOPT-NEXT: s_mov_b32 s5, 0x41800000 -; NOOPT-NEXT: s_mov_b32 s6, 0x41700000 -; NOOPT-NEXT: s_mov_b32 s7, 0x41600000 -; NOOPT-NEXT: s_mov_b32 s8, 0x41500000 -; NOOPT-NEXT: s_mov_b32 s9, 0x41400000 -; NOOPT-NEXT: s_mov_b32 s10, 0x41300000 -; NOOPT-NEXT: s_mov_b32 s11, 0x41200000 -; NOOPT-NEXT: s_mov_b32 s12, 0x41100000 -; NOOPT-NEXT: s_mov_b32 s13, 0x41000000 -; NOOPT-NEXT: s_mov_b32 s14, 0x40e00000 -; NOOPT-NEXT: s_mov_b32 s15, 0x40c00000 -; NOOPT-NEXT: s_mov_b32 s16, 0x40a00000 -; NOOPT-NEXT: s_mov_b32 s17, 4.0 -; NOOPT-NEXT: s_mov_b32 s18, 0x40400000 -; NOOPT-NEXT: s_mov_b32 s19, 2.0 -; NOOPT-NEXT: s_mov_b32 s20, 1.0 -; NOOPT-NEXT: v_mov_b32_e32 v7, s20 -; NOOPT-NEXT: v_mov_b32_e32 v30, s19 -; NOOPT-NEXT: v_mov_b32_e32 v29, s18 -; NOOPT-NEXT: v_mov_b32_e32 v28, s17 -; NOOPT-NEXT: v_mov_b32_e32 v27, s16 -; NOOPT-NEXT: v_mov_b32_e32 v26, s15 -; NOOPT-NEXT: v_mov_b32_e32 v25, s14 -; NOOPT-NEXT: v_mov_b32_e32 v24, s13 -; NOOPT-NEXT: v_mov_b32_e32 v23, s12 -; NOOPT-NEXT: v_mov_b32_e32 v6, s11 -; NOOPT-NEXT: v_mov_b32_e32 v5, s10 -; NOOPT-NEXT: v_mov_b32_e32 v4, s9 -; NOOPT-NEXT: v_mov_b32_e32 v3, s8 -; NOOPT-NEXT: v_mov_b32_e32 v2, s7 -; NOOPT-NEXT: v_mov_b32_e32 v1, s6 -; NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; NOOPT-NEXT: s_mov_b32 s1, s9 +; NOOPT-NEXT: s_mov_b32 s2, s8 +; NOOPT-NEXT: s_mov_b32 s3, s6 +; NOOPT-NEXT: s_mov_b32 s8, 0xffff +; NOOPT-NEXT: s_and_b32 s7, s7, s8 +; NOOPT-NEXT: s_mov_b32 s8, 0x41800000 +; NOOPT-NEXT: s_mov_b32 s9, 0x41700000 +; NOOPT-NEXT: s_mov_b32 s10, 0x41600000 +; NOOPT-NEXT: s_mov_b32 s11, 0x41500000 +; NOOPT-NEXT: s_mov_b32 s12, 0x41400000 +; NOOPT-NEXT: s_mov_b32 s13, 0x41300000 +; NOOPT-NEXT: s_mov_b32 s14, 0x41200000 +; NOOPT-NEXT: s_mov_b32 s15, 0x41100000 +; NOOPT-NEXT: s_mov_b32 s16, 0x41000000 +; NOOPT-NEXT: s_mov_b32 s17, 0x40e00000 +; NOOPT-NEXT: s_mov_b32 s18, 0x40c00000 +; NOOPT-NEXT: s_mov_b32 s19, 0x40a00000 +; NOOPT-NEXT: s_mov_b32 s20, 4.0 +; NOOPT-NEXT: s_mov_b32 s21, 0x40400000 +; NOOPT-NEXT: s_mov_b32 s22, 2.0 +; NOOPT-NEXT: s_mov_b32 s23, 1.0 +; NOOPT-NEXT: v_mov_b32_e32 v7, s23 +; NOOPT-NEXT: v_mov_b32_e32 v30, s22 +; NOOPT-NEXT: v_mov_b32_e32 v29, s21 +; NOOPT-NEXT: v_mov_b32_e32 v28, s20 +; NOOPT-NEXT: v_mov_b32_e32 v27, s19 +; NOOPT-NEXT: v_mov_b32_e32 v26, s18 +; NOOPT-NEXT: v_mov_b32_e32 v25, s17 +; NOOPT-NEXT: v_mov_b32_e32 v24, s16 +; NOOPT-NEXT: v_mov_b32_e32 v23, s15 +; NOOPT-NEXT: v_mov_b32_e32 v6, s14 +; NOOPT-NEXT: v_mov_b32_e32 v5, s13 +; NOOPT-NEXT: v_mov_b32_e32 v4, s12 +; NOOPT-NEXT: v_mov_b32_e32 v3, s11 +; NOOPT-NEXT: v_mov_b32_e32 v2, s10 +; NOOPT-NEXT: v_mov_b32_e32 v1, s9 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, s8 ; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22 killed $exec ; NOOPT-NEXT: v_mov_b32_e32 v8, v30 ; NOOPT-NEXT: v_mov_b32_e32 v9, v29 @@ -2378,7 +2462,7 @@ define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou ; NOOPT-NEXT: v_mov_b32_e32 v21, v1 ; NOOPT-NEXT: v_mov_b32_e32 v22, v0 ; NOOPT-NEXT: v_mov_b32_e32 v0, 0x41880000 -; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: s_mov_b32 m0, s7 ; NOOPT-NEXT: v_movreld_b32_e32 v8, v0 ; NOOPT-NEXT: v_mov_b32_e32 v4, v22 ; NOOPT-NEXT: v_mov_b32_e32 v5, v21 @@ -2388,7 +2472,14 @@ define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou ; NOOPT-NEXT: v_mov_b32_e32 v1, v6 ; NOOPT-NEXT: v_mov_b32_e32 v2, v5 ; NOOPT-NEXT: v_mov_b32_e32 v3, v4 -; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: s_mov_b32 s8, 0 +; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; NOOPT-NEXT: s_mov_b32 s9, s6 +; NOOPT-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: s_mov_b64 s[6:7], s[8:9] +; NOOPT-NEXT: v_mov_b32_e32 v4, 32 +; NOOPT-NEXT: v_mov_b32_e32 v5, 0 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:16 ; NOOPT-NEXT: v_mov_b32_e32 v4, v18 ; NOOPT-NEXT: v_mov_b32_e32 v5, v17 ; NOOPT-NEXT: v_mov_b32_e32 v6, v16 @@ -2671,52 +2762,63 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, ; ; NOOPT-LABEL: insert_signed_base_plus_offset: ; NOOPT: ; %bb.0: ; %entry -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; NOOPT-NEXT: s_load_dword s4, s[4:5], 0xb +; NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5] +; NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) -; NOOPT-NEXT: s_mov_b32 s7, s1 -; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 -; NOOPT-NEXT: s_mov_b32 s5, 0xf000 -; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: s_mov_b32 s2, s1 +; NOOPT-NEXT: s_mov_b32 s12, s0 +; NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; NOOPT-NEXT: s_mov_b32 s8, -1 +; NOOPT-NEXT: ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13_sgpr14_sgpr15 +; NOOPT-NEXT: s_mov_b32 s13, s2 +; NOOPT-NEXT: s_mov_b32 s14, s8 +; NOOPT-NEXT: s_mov_b32 s15, s6 +; NOOPT-NEXT: buffer_load_ushort v0, off, s[12:15], 0 offset:44 +; NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; NOOPT-NEXT: s_load_dword s7, s[0:1], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s9, s5 +; NOOPT-NEXT: s_mov_b32 s0, s4 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 -; NOOPT-NEXT: s_mov_b32 s1, s7 -; NOOPT-NEXT: s_mov_b32 s2, s6 -; NOOPT-NEXT: s_mov_b32 s3, s5 -; NOOPT-NEXT: s_sext_i32_i16 s4, s4 -; NOOPT-NEXT: s_mov_b32 s5, 1 -; NOOPT-NEXT: s_add_i32 s4, s4, s5 -; NOOPT-NEXT: s_mov_b32 s5, 0x41800000 -; NOOPT-NEXT: s_mov_b32 s6, 0x41700000 -; NOOPT-NEXT: s_mov_b32 s7, 0x41600000 -; NOOPT-NEXT: s_mov_b32 s8, 0x41500000 -; NOOPT-NEXT: s_mov_b32 s9, 0x41400000 -; NOOPT-NEXT: s_mov_b32 s10, 0x41300000 -; NOOPT-NEXT: s_mov_b32 s11, 0x41200000 -; NOOPT-NEXT: s_mov_b32 s12, 0x41100000 -; NOOPT-NEXT: s_mov_b32 s13, 0x41000000 -; NOOPT-NEXT: s_mov_b32 s14, 0x40e00000 -; NOOPT-NEXT: s_mov_b32 s15, 0x40c00000 -; NOOPT-NEXT: s_mov_b32 s16, 0x40a00000 -; NOOPT-NEXT: s_mov_b32 s17, 4.0 -; NOOPT-NEXT: s_mov_b32 s18, 0x40400000 -; NOOPT-NEXT: s_mov_b32 s19, 2.0 -; NOOPT-NEXT: s_mov_b32 s20, 1.0 -; NOOPT-NEXT: v_mov_b32_e32 v7, s20 -; NOOPT-NEXT: v_mov_b32_e32 v30, s19 -; NOOPT-NEXT: v_mov_b32_e32 v29, s18 -; NOOPT-NEXT: v_mov_b32_e32 v28, s17 -; NOOPT-NEXT: v_mov_b32_e32 v27, s16 -; NOOPT-NEXT: v_mov_b32_e32 v26, s15 -; NOOPT-NEXT: v_mov_b32_e32 v25, s14 -; NOOPT-NEXT: v_mov_b32_e32 v24, s13 -; NOOPT-NEXT: v_mov_b32_e32 v23, s12 -; NOOPT-NEXT: v_mov_b32_e32 v6, s11 -; NOOPT-NEXT: v_mov_b32_e32 v5, s10 -; NOOPT-NEXT: v_mov_b32_e32 v4, s9 -; NOOPT-NEXT: v_mov_b32_e32 v3, s8 -; NOOPT-NEXT: v_mov_b32_e32 v2, s7 -; NOOPT-NEXT: v_mov_b32_e32 v1, s6 -; NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; NOOPT-NEXT: s_mov_b32 s1, s9 +; NOOPT-NEXT: s_mov_b32 s2, s8 +; NOOPT-NEXT: s_mov_b32 s3, s6 +; NOOPT-NEXT: s_sext_i32_i16 s7, s7 +; NOOPT-NEXT: s_mov_b32 s8, 1 +; NOOPT-NEXT: s_add_i32 s7, s7, s8 +; NOOPT-NEXT: s_mov_b32 s8, 0x41800000 +; NOOPT-NEXT: s_mov_b32 s9, 0x41700000 +; NOOPT-NEXT: s_mov_b32 s10, 0x41600000 +; NOOPT-NEXT: s_mov_b32 s11, 0x41500000 +; NOOPT-NEXT: s_mov_b32 s12, 0x41400000 +; NOOPT-NEXT: s_mov_b32 s13, 0x41300000 +; NOOPT-NEXT: s_mov_b32 s14, 0x41200000 +; NOOPT-NEXT: s_mov_b32 s15, 0x41100000 +; NOOPT-NEXT: s_mov_b32 s16, 0x41000000 +; NOOPT-NEXT: s_mov_b32 s17, 0x40e00000 +; NOOPT-NEXT: s_mov_b32 s18, 0x40c00000 +; NOOPT-NEXT: s_mov_b32 s19, 0x40a00000 +; NOOPT-NEXT: s_mov_b32 s20, 4.0 +; NOOPT-NEXT: s_mov_b32 s21, 0x40400000 +; NOOPT-NEXT: s_mov_b32 s22, 2.0 +; NOOPT-NEXT: s_mov_b32 s23, 1.0 +; NOOPT-NEXT: v_mov_b32_e32 v7, s23 +; NOOPT-NEXT: v_mov_b32_e32 v30, s22 +; NOOPT-NEXT: v_mov_b32_e32 v29, s21 +; NOOPT-NEXT: v_mov_b32_e32 v28, s20 +; NOOPT-NEXT: v_mov_b32_e32 v27, s19 +; NOOPT-NEXT: v_mov_b32_e32 v26, s18 +; NOOPT-NEXT: v_mov_b32_e32 v25, s17 +; NOOPT-NEXT: v_mov_b32_e32 v24, s16 +; NOOPT-NEXT: v_mov_b32_e32 v23, s15 +; NOOPT-NEXT: v_mov_b32_e32 v6, s14 +; NOOPT-NEXT: v_mov_b32_e32 v5, s13 +; NOOPT-NEXT: v_mov_b32_e32 v4, s12 +; NOOPT-NEXT: v_mov_b32_e32 v3, s11 +; NOOPT-NEXT: v_mov_b32_e32 v2, s10 +; NOOPT-NEXT: v_mov_b32_e32 v1, s9 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, s8 ; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22 killed $exec ; NOOPT-NEXT: v_mov_b32_e32 v8, v30 ; NOOPT-NEXT: v_mov_b32_e32 v9, v29 @@ -2734,7 +2836,7 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, ; NOOPT-NEXT: v_mov_b32_e32 v21, v1 ; NOOPT-NEXT: v_mov_b32_e32 v22, v0 ; NOOPT-NEXT: v_mov_b32_e32 v0, 0x41880000 -; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: s_mov_b32 m0, s7 ; NOOPT-NEXT: v_movreld_b32_e32 v7, v0 ; NOOPT-NEXT: v_mov_b32_e32 v4, v22 ; NOOPT-NEXT: v_mov_b32_e32 v5, v21 @@ -2744,7 +2846,14 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, ; NOOPT-NEXT: v_mov_b32_e32 v1, v6 ; NOOPT-NEXT: v_mov_b32_e32 v2, v5 ; NOOPT-NEXT: v_mov_b32_e32 v3, v4 -; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: s_mov_b32 s8, 0 +; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; NOOPT-NEXT: s_mov_b32 s9, s6 +; NOOPT-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: s_mov_b64 s[6:7], s[8:9] +; NOOPT-NEXT: v_mov_b32_e32 v4, 32 +; NOOPT-NEXT: v_mov_b32_e32 v5, 0 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:16 ; NOOPT-NEXT: v_mov_b32_e32 v4, v18 ; NOOPT-NEXT: v_mov_b32_e32 v5, v17 ; NOOPT-NEXT: v_mov_b32_e32 v6, v16 @@ -3031,49 +3140,53 @@ define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) { ; ; NOOPT-LABEL: insert_wo_offset: ; NOOPT: ; %bb.0: ; %entry -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; NOOPT-NEXT: s_load_dword s4, s[4:5], 0xb +; NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5] +; NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) -; NOOPT-NEXT: s_mov_b32 s7, s1 -; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 -; NOOPT-NEXT: s_mov_b32 s5, 0xf000 -; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: s_load_dword s2, s[0:1], 0xb +; NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; NOOPT-NEXT: s_load_dword s7, s[0:1], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s9, s5 +; NOOPT-NEXT: s_mov_b32 s0, s4 +; NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; NOOPT-NEXT: s_mov_b32 s8, -1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 -; NOOPT-NEXT: s_mov_b32 s1, s7 -; NOOPT-NEXT: s_mov_b32 s2, s6 -; NOOPT-NEXT: s_mov_b32 s3, s5 -; NOOPT-NEXT: s_mov_b32 s5, 0x41800000 -; NOOPT-NEXT: s_mov_b32 s6, 0x41700000 -; NOOPT-NEXT: s_mov_b32 s7, 0x41600000 -; NOOPT-NEXT: s_mov_b32 s8, 0x41500000 -; NOOPT-NEXT: s_mov_b32 s9, 0x41400000 -; NOOPT-NEXT: s_mov_b32 s10, 0x41300000 -; NOOPT-NEXT: s_mov_b32 s11, 0x41200000 -; NOOPT-NEXT: s_mov_b32 s12, 0x41100000 -; NOOPT-NEXT: s_mov_b32 s13, 0x41000000 -; NOOPT-NEXT: s_mov_b32 s14, 0x40e00000 -; NOOPT-NEXT: s_mov_b32 s15, 0x40c00000 -; NOOPT-NEXT: s_mov_b32 s16, 0x40a00000 -; NOOPT-NEXT: s_mov_b32 s17, 4.0 -; NOOPT-NEXT: s_mov_b32 s18, 0x40400000 -; NOOPT-NEXT: s_mov_b32 s19, 2.0 -; NOOPT-NEXT: s_mov_b32 s20, 1.0 -; NOOPT-NEXT: v_mov_b32_e32 v7, s20 -; NOOPT-NEXT: v_mov_b32_e32 v30, s19 -; NOOPT-NEXT: v_mov_b32_e32 v29, s18 -; NOOPT-NEXT: v_mov_b32_e32 v28, s17 -; NOOPT-NEXT: v_mov_b32_e32 v27, s16 -; NOOPT-NEXT: v_mov_b32_e32 v26, s15 -; NOOPT-NEXT: v_mov_b32_e32 v25, s14 -; NOOPT-NEXT: v_mov_b32_e32 v24, s13 -; NOOPT-NEXT: v_mov_b32_e32 v23, s12 -; NOOPT-NEXT: v_mov_b32_e32 v6, s11 -; NOOPT-NEXT: v_mov_b32_e32 v5, s10 -; NOOPT-NEXT: v_mov_b32_e32 v4, s9 -; NOOPT-NEXT: v_mov_b32_e32 v3, s8 -; NOOPT-NEXT: v_mov_b32_e32 v2, s7 -; NOOPT-NEXT: v_mov_b32_e32 v1, s6 -; NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; NOOPT-NEXT: s_mov_b32 s1, s9 +; NOOPT-NEXT: s_mov_b32 s2, s8 +; NOOPT-NEXT: s_mov_b32 s3, s6 +; NOOPT-NEXT: s_mov_b32 s8, 0x41800000 +; NOOPT-NEXT: s_mov_b32 s9, 0x41700000 +; NOOPT-NEXT: s_mov_b32 s10, 0x41600000 +; NOOPT-NEXT: s_mov_b32 s11, 0x41500000 +; NOOPT-NEXT: s_mov_b32 s12, 0x41400000 +; NOOPT-NEXT: s_mov_b32 s13, 0x41300000 +; NOOPT-NEXT: s_mov_b32 s14, 0x41200000 +; NOOPT-NEXT: s_mov_b32 s15, 0x41100000 +; NOOPT-NEXT: s_mov_b32 s16, 0x41000000 +; NOOPT-NEXT: s_mov_b32 s17, 0x40e00000 +; NOOPT-NEXT: s_mov_b32 s18, 0x40c00000 +; NOOPT-NEXT: s_mov_b32 s19, 0x40a00000 +; NOOPT-NEXT: s_mov_b32 s20, 4.0 +; NOOPT-NEXT: s_mov_b32 s21, 0x40400000 +; NOOPT-NEXT: s_mov_b32 s22, 2.0 +; NOOPT-NEXT: s_mov_b32 s23, 1.0 +; NOOPT-NEXT: v_mov_b32_e32 v7, s23 +; NOOPT-NEXT: v_mov_b32_e32 v30, s22 +; NOOPT-NEXT: v_mov_b32_e32 v29, s21 +; NOOPT-NEXT: v_mov_b32_e32 v28, s20 +; NOOPT-NEXT: v_mov_b32_e32 v27, s19 +; NOOPT-NEXT: v_mov_b32_e32 v26, s18 +; NOOPT-NEXT: v_mov_b32_e32 v25, s17 +; NOOPT-NEXT: v_mov_b32_e32 v24, s16 +; NOOPT-NEXT: v_mov_b32_e32 v23, s15 +; NOOPT-NEXT: v_mov_b32_e32 v6, s14 +; NOOPT-NEXT: v_mov_b32_e32 v5, s13 +; NOOPT-NEXT: v_mov_b32_e32 v4, s12 +; NOOPT-NEXT: v_mov_b32_e32 v3, s11 +; NOOPT-NEXT: v_mov_b32_e32 v2, s10 +; NOOPT-NEXT: v_mov_b32_e32 v1, s9 +; NOOPT-NEXT: v_mov_b32_e32 v0, s8 ; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22 killed $exec ; NOOPT-NEXT: v_mov_b32_e32 v8, v30 ; NOOPT-NEXT: v_mov_b32_e32 v9, v29 @@ -3091,7 +3204,7 @@ define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) { ; NOOPT-NEXT: v_mov_b32_e32 v21, v1 ; NOOPT-NEXT: v_mov_b32_e32 v22, v0 ; NOOPT-NEXT: v_mov_b32_e32 v0, 0x41880000 -; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: s_mov_b32 m0, s7 ; NOOPT-NEXT: v_movreld_b32_e32 v7, v0 ; NOOPT-NEXT: v_mov_b32_e32 v4, v22 ; NOOPT-NEXT: v_mov_b32_e32 v5, v21 @@ -3101,7 +3214,14 @@ define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) { ; NOOPT-NEXT: v_mov_b32_e32 v1, v6 ; NOOPT-NEXT: v_mov_b32_e32 v2, v5 ; NOOPT-NEXT: v_mov_b32_e32 v3, v4 -; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: s_mov_b32 s8, 0 +; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; NOOPT-NEXT: s_mov_b32 s9, s6 +; NOOPT-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: s_mov_b64 s[6:7], s[8:9] +; NOOPT-NEXT: v_mov_b32_e32 v4, 32 +; NOOPT-NEXT: v_mov_b32_e32 v5, 0 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:16 ; NOOPT-NEXT: v_mov_b32_e32 v4, v18 ; NOOPT-NEXT: v_mov_b32_e32 v5, v17 ; NOOPT-NEXT: v_mov_b32_e32 v6, v16 @@ -3365,49 +3485,55 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; ; NOOPT-LABEL: insert_neg_offset_sgpr: ; NOOPT: ; %bb.0: ; %entry -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb -; NOOPT-NEXT: s_load_dword s4, s[4:5], 0xd +; NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5] +; NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) -; NOOPT-NEXT: s_mov_b32 s7, s1 -; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 -; NOOPT-NEXT: s_mov_b32 s5, 0xf000 -; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dword s2, s[0:1], 0xd +; NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; NOOPT-NEXT: s_load_dword s7, s[0:1], 0xd +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s9, s5 +; NOOPT-NEXT: s_mov_b32 s0, s4 +; NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; NOOPT-NEXT: s_mov_b32 s8, -1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 -; NOOPT-NEXT: s_mov_b32 s1, s7 -; NOOPT-NEXT: s_mov_b32 s2, s6 -; NOOPT-NEXT: s_mov_b32 s3, s5 -; NOOPT-NEXT: s_mov_b32 s5, 15 -; NOOPT-NEXT: s_mov_b32 s6, 14 -; NOOPT-NEXT: s_mov_b32 s7, 13 -; NOOPT-NEXT: s_mov_b32 s8, 12 -; NOOPT-NEXT: s_mov_b32 s9, 11 -; NOOPT-NEXT: s_mov_b32 s10, 10 -; NOOPT-NEXT: s_mov_b32 s11, 9 -; NOOPT-NEXT: s_mov_b32 s12, 8 -; NOOPT-NEXT: s_mov_b32 s13, 7 -; NOOPT-NEXT: s_mov_b32 s14, 6 -; NOOPT-NEXT: s_mov_b32 s15, 5 -; NOOPT-NEXT: s_mov_b32 s16, 4 -; NOOPT-NEXT: s_mov_b32 s17, 3 -; NOOPT-NEXT: s_mov_b32 s18, 2 -; NOOPT-NEXT: s_mov_b32 s19, 1 -; NOOPT-NEXT: s_mov_b32 s20, 0 -; NOOPT-NEXT: v_mov_b32_e32 v15, s20 -; NOOPT-NEXT: v_mov_b32_e32 v14, s19 -; NOOPT-NEXT: v_mov_b32_e32 v13, s18 -; NOOPT-NEXT: v_mov_b32_e32 v12, s17 -; NOOPT-NEXT: v_mov_b32_e32 v11, s16 -; NOOPT-NEXT: v_mov_b32_e32 v10, s15 -; NOOPT-NEXT: v_mov_b32_e32 v9, s14 -; NOOPT-NEXT: v_mov_b32_e32 v8, s13 -; NOOPT-NEXT: v_mov_b32_e32 v7, s12 -; NOOPT-NEXT: v_mov_b32_e32 v6, s11 -; NOOPT-NEXT: v_mov_b32_e32 v5, s10 -; NOOPT-NEXT: v_mov_b32_e32 v4, s9 -; NOOPT-NEXT: v_mov_b32_e32 v3, s8 -; NOOPT-NEXT: v_mov_b32_e32 v2, s7 -; NOOPT-NEXT: v_mov_b32_e32 v1, s6 -; NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; NOOPT-NEXT: s_mov_b32 s1, s9 +; NOOPT-NEXT: s_mov_b32 s2, s8 +; NOOPT-NEXT: s_mov_b32 s3, s6 +; NOOPT-NEXT: s_mov_b32 s9, 15 +; NOOPT-NEXT: s_mov_b32 s10, 14 +; NOOPT-NEXT: s_mov_b32 s11, 13 +; NOOPT-NEXT: s_mov_b32 s12, 12 +; NOOPT-NEXT: s_mov_b32 s13, 11 +; NOOPT-NEXT: s_mov_b32 s14, 10 +; NOOPT-NEXT: s_mov_b32 s15, 9 +; NOOPT-NEXT: s_mov_b32 s16, 8 +; NOOPT-NEXT: s_mov_b32 s17, 7 +; NOOPT-NEXT: s_mov_b32 s18, 6 +; NOOPT-NEXT: s_mov_b32 s19, 5 +; NOOPT-NEXT: s_mov_b32 s20, 4 +; NOOPT-NEXT: s_mov_b32 s21, 3 +; NOOPT-NEXT: s_mov_b32 s22, 2 +; NOOPT-NEXT: s_mov_b32 s23, 1 +; NOOPT-NEXT: s_mov_b32 s8, 0 +; NOOPT-NEXT: v_mov_b32_e32 v15, s8 +; NOOPT-NEXT: v_mov_b32_e32 v14, s23 +; NOOPT-NEXT: v_mov_b32_e32 v13, s22 +; NOOPT-NEXT: v_mov_b32_e32 v12, s21 +; NOOPT-NEXT: v_mov_b32_e32 v11, s20 +; NOOPT-NEXT: v_mov_b32_e32 v10, s19 +; NOOPT-NEXT: v_mov_b32_e32 v9, s18 +; NOOPT-NEXT: v_mov_b32_e32 v8, s17 +; NOOPT-NEXT: v_mov_b32_e32 v7, s16 +; NOOPT-NEXT: v_mov_b32_e32 v6, s15 +; NOOPT-NEXT: v_mov_b32_e32 v5, s14 +; NOOPT-NEXT: v_mov_b32_e32 v4, s13 +; NOOPT-NEXT: v_mov_b32_e32 v3, s12 +; NOOPT-NEXT: v_mov_b32_e32 v2, s11 +; NOOPT-NEXT: v_mov_b32_e32 v1, s10 +; NOOPT-NEXT: v_mov_b32_e32 v0, s9 ; NOOPT-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30 killed $exec ; NOOPT-NEXT: v_mov_b32_e32 v16, v14 ; NOOPT-NEXT: v_mov_b32_e32 v17, v13 @@ -3425,7 +3551,7 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; NOOPT-NEXT: v_mov_b32_e32 v29, v1 ; NOOPT-NEXT: v_mov_b32_e32 v30, v0 ; NOOPT-NEXT: v_mov_b32_e32 v0, 16 -; NOOPT-NEXT: s_add_i32 m0, s4, 0xfffffe00 +; NOOPT-NEXT: s_add_i32 m0, s7, 0xfffffe00 ; NOOPT-NEXT: v_movreld_b32_e32 v15, v0 ; NOOPT-NEXT: v_mov_b32_e32 v4, v18 ; NOOPT-NEXT: v_mov_b32_e32 v5, v17 @@ -3447,7 +3573,13 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; NOOPT-NEXT: v_mov_b32_e32 v16, v11 ; NOOPT-NEXT: v_mov_b32_e32 v17, v10 ; NOOPT-NEXT: v_mov_b32_e32 v18, v9 -; NOOPT-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; NOOPT-NEXT: s_mov_b32 s9, s6 +; NOOPT-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: s_mov_b64 s[6:7], s[8:9] +; NOOPT-NEXT: v_mov_b32_e32 v9, 32 +; NOOPT-NEXT: v_mov_b32_e32 v10, 0 +; NOOPT-NEXT: buffer_store_dwordx4 v[15:18], v[9:10], s[4:7], 0 addr64 offset:16 ; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec ; NOOPT-NEXT: v_mov_b32_e32 v9, v14 ; NOOPT-NEXT: v_mov_b32_e32 v10, v13 @@ -3714,18 +3846,25 @@ define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in, ; ; NOOPT-LABEL: insert_neg_offset_sgpr_loadreg: ; NOOPT: ; %bb.0: ; %entry -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb -; NOOPT-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 -; NOOPT-NEXT: s_load_dword s4, s[4:5], 0x29 +; NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5] +; NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) -; NOOPT-NEXT: s_mov_b32 s7, s1 -; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 -; NOOPT-NEXT: s_mov_b32 s5, 0xf000 -; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; NOOPT-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dword s2, s[0:1], 0x29 +; NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; NOOPT-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x19 +; NOOPT-NEXT: s_load_dword s7, s[0:1], 0x29 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s25, s5 +; NOOPT-NEXT: s_mov_b32 s0, s4 +; NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; NOOPT-NEXT: s_mov_b32 s24, -1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 -; NOOPT-NEXT: s_mov_b32 s1, s7 -; NOOPT-NEXT: s_mov_b32 s2, s6 -; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s1, s25 +; NOOPT-NEXT: s_mov_b32 s2, s24 +; NOOPT-NEXT: s_mov_b32 s3, s6 ; NOOPT-NEXT: v_mov_b32_e32 v0, 5 ; NOOPT-NEXT: v_mov_b32_e32 v30, s23 ; NOOPT-NEXT: v_mov_b32_e32 v29, s22 @@ -3743,7 +3882,7 @@ define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in, ; NOOPT-NEXT: v_mov_b32_e32 v17, s10 ; NOOPT-NEXT: v_mov_b32_e32 v16, s9 ; NOOPT-NEXT: v_mov_b32_e32 v15, s8 -; NOOPT-NEXT: s_add_i32 m0, s4, 0xfffffe00 +; NOOPT-NEXT: s_add_i32 m0, s7, 0xfffffe00 ; NOOPT-NEXT: v_movreld_b32_e32 v15, v0 ; NOOPT-NEXT: v_mov_b32_e32 v4, v18 ; NOOPT-NEXT: v_mov_b32_e32 v5, v17 @@ -3765,7 +3904,14 @@ define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in, ; NOOPT-NEXT: v_mov_b32_e32 v16, v11 ; NOOPT-NEXT: v_mov_b32_e32 v17, v10 ; NOOPT-NEXT: v_mov_b32_e32 v18, v9 -; NOOPT-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: s_mov_b32 s8, 0 +; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; NOOPT-NEXT: s_mov_b32 s9, s6 +; NOOPT-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: s_mov_b64 s[6:7], s[8:9] +; NOOPT-NEXT: v_mov_b32_e32 v9, 32 +; NOOPT-NEXT: v_mov_b32_e32 v10, 0 +; NOOPT-NEXT: buffer_store_dwordx4 v[15:18], v[9:10], s[4:7], 0 addr64 offset:16 ; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec ; NOOPT-NEXT: v_mov_b32_e32 v9, v14 ; NOOPT-NEXT: v_mov_b32_e32 v10, v13 @@ -4006,21 +4152,27 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; NOOPT-NEXT: s_add_u32 s20, s20, s11 ; NOOPT-NEXT: s_addc_u32 s21, s21, 0 ; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:136 ; 4-byte Folded Spill +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; NOOPT-NEXT: ; implicit-def: $vgpr31 : SGPR spill to VGPR lane +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: v_writelane_b32 v31, s0, 0 +; NOOPT-NEXT: v_writelane_b32 v31, s1, 1 ; NOOPT-NEXT: s_mov_b32 s6, s1 -; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 ; NOOPT-NEXT: s_mov_b32 s4, 0xf000 +; NOOPT-NEXT: v_writelane_b32 v31, s4, 2 ; NOOPT-NEXT: s_mov_b32 s5, -1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 ; NOOPT-NEXT: s_mov_b32 s1, s6 ; NOOPT-NEXT: s_mov_b32 s2, s5 ; NOOPT-NEXT: s_mov_b32 s3, s4 -; NOOPT-NEXT: ; implicit-def: $vgpr31 : SGPR spill to VGPR lane -; NOOPT-NEXT: v_writelane_b32 v31, s0, 0 -; NOOPT-NEXT: v_writelane_b32 v31, s1, 1 -; NOOPT-NEXT: v_writelane_b32 v31, s2, 2 -; NOOPT-NEXT: v_writelane_b32 v31, s3, 3 +; NOOPT-NEXT: v_writelane_b32 v31, s0, 3 +; NOOPT-NEXT: v_writelane_b32 v31, s1, 4 +; NOOPT-NEXT: v_writelane_b32 v31, s2, 5 +; NOOPT-NEXT: v_writelane_b32 v31, s3, 6 ; NOOPT-NEXT: s_mov_b32 s0, 16 ; NOOPT-NEXT: s_mov_b32 s1, 15 ; NOOPT-NEXT: s_mov_b32 s2, 14 @@ -4089,8 +4241,8 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; NOOPT-NEXT: v_mov_b32_e32 v16, 33 ; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[0:1], exec -; NOOPT-NEXT: v_writelane_b32 v31, s0, 4 -; NOOPT-NEXT: v_writelane_b32 v31, s1, 5 +; NOOPT-NEXT: v_writelane_b32 v31, s0, 7 +; NOOPT-NEXT: v_writelane_b32 v31, s1, 8 ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 ; NOOPT-NEXT: buffer_store_dword v31, off, s[20:23], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] @@ -4117,8 +4269,8 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v31, 6 -; NOOPT-NEXT: v_readlane_b32 s1, v31, 7 +; NOOPT-NEXT: v_readlane_b32 s0, v31, 9 +; NOOPT-NEXT: v_readlane_b32 s1, v31, 10 ; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload @@ -4183,8 +4335,8 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] -; NOOPT-NEXT: v_writelane_b32 v31, s2, 6 -; NOOPT-NEXT: v_writelane_b32 v31, s3, 7 +; NOOPT-NEXT: v_writelane_b32 v31, s2, 9 +; NOOPT-NEXT: v_writelane_b32 v31, s3, 10 ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 ; NOOPT-NEXT: buffer_store_dword v31, off, s[20:23], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] @@ -4196,18 +4348,21 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v31, 4 -; NOOPT-NEXT: v_readlane_b32 s1, v31, 5 +; NOOPT-NEXT: v_readlane_b32 s0, v31, 7 +; NOOPT-NEXT: v_readlane_b32 s1, v31, 8 ; NOOPT-NEXT: s_mov_b64 exec, s[0:1] ; NOOPT-NEXT: ; %bb.3: ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 ; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v31, 0 -; NOOPT-NEXT: v_readlane_b32 s1, v31, 1 -; NOOPT-NEXT: v_readlane_b32 s2, v31, 2 -; NOOPT-NEXT: v_readlane_b32 s3, v31, 3 +; NOOPT-NEXT: v_readlane_b32 s0, v31, 3 +; NOOPT-NEXT: v_readlane_b32 s1, v31, 4 +; NOOPT-NEXT: v_readlane_b32 s2, v31, 5 +; NOOPT-NEXT: v_readlane_b32 s3, v31, 6 +; NOOPT-NEXT: v_readlane_b32 s4, v31, 0 +; NOOPT-NEXT: v_readlane_b32 s5, v31, 1 +; NOOPT-NEXT: v_readlane_b32 s6, v31, 2 ; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload @@ -4248,7 +4403,14 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; NOOPT-NEXT: v_mov_b32_e32 v16, v11 ; NOOPT-NEXT: v_mov_b32_e32 v17, v10 ; NOOPT-NEXT: v_mov_b32_e32 v18, v9 -; NOOPT-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: s_mov_b32 s8, 0 +; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; NOOPT-NEXT: s_mov_b32 s9, s6 +; NOOPT-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: s_mov_b64 s[6:7], s[8:9] +; NOOPT-NEXT: v_mov_b32_e32 v9, 32 +; NOOPT-NEXT: v_mov_b32_e32 v10, 0 +; NOOPT-NEXT: buffer_store_dwordx4 v[15:18], v[9:10], s[4:7], 0 addr64 offset:16 ; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec ; NOOPT-NEXT: v_mov_b32_e32 v9, v14 ; NOOPT-NEXT: v_mov_b32_e32 v10, v13 @@ -4479,21 +4641,27 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; NOOPT-NEXT: s_add_u32 s20, s20, s11 ; NOOPT-NEXT: s_addc_u32 s21, s21, 0 ; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:136 ; 4-byte Folded Spill +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; NOOPT-NEXT: ; implicit-def: $vgpr31 : SGPR spill to VGPR lane +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: v_writelane_b32 v31, s0, 0 +; NOOPT-NEXT: v_writelane_b32 v31, s1, 1 ; NOOPT-NEXT: s_mov_b32 s6, s1 -; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 ; NOOPT-NEXT: s_mov_b32 s4, 0xf000 +; NOOPT-NEXT: v_writelane_b32 v31, s4, 2 ; NOOPT-NEXT: s_mov_b32 s5, -1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 ; NOOPT-NEXT: s_mov_b32 s1, s6 ; NOOPT-NEXT: s_mov_b32 s2, s5 ; NOOPT-NEXT: s_mov_b32 s3, s4 -; NOOPT-NEXT: ; implicit-def: $vgpr31 : SGPR spill to VGPR lane -; NOOPT-NEXT: v_writelane_b32 v31, s0, 0 -; NOOPT-NEXT: v_writelane_b32 v31, s1, 1 -; NOOPT-NEXT: v_writelane_b32 v31, s2, 2 -; NOOPT-NEXT: v_writelane_b32 v31, s3, 3 +; NOOPT-NEXT: v_writelane_b32 v31, s0, 3 +; NOOPT-NEXT: v_writelane_b32 v31, s1, 4 +; NOOPT-NEXT: v_writelane_b32 v31, s2, 5 +; NOOPT-NEXT: v_writelane_b32 v31, s3, 6 ; NOOPT-NEXT: s_mov_b32 s0, 16 ; NOOPT-NEXT: s_mov_b32 s1, 15 ; NOOPT-NEXT: s_mov_b32 s2, 14 @@ -4562,8 +4730,8 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; NOOPT-NEXT: v_mov_b32_e32 v16, 0x1f4 ; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[0:1], exec -; NOOPT-NEXT: v_writelane_b32 v31, s0, 4 -; NOOPT-NEXT: v_writelane_b32 v31, s1, 5 +; NOOPT-NEXT: v_writelane_b32 v31, s0, 7 +; NOOPT-NEXT: v_writelane_b32 v31, s1, 8 ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 ; NOOPT-NEXT: buffer_store_dword v31, off, s[20:23], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] @@ -4590,8 +4758,8 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v31, 6 -; NOOPT-NEXT: v_readlane_b32 s1, v31, 7 +; NOOPT-NEXT: v_readlane_b32 s0, v31, 9 +; NOOPT-NEXT: v_readlane_b32 s1, v31, 10 ; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload @@ -4656,8 +4824,8 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] -; NOOPT-NEXT: v_writelane_b32 v31, s2, 6 -; NOOPT-NEXT: v_writelane_b32 v31, s3, 7 +; NOOPT-NEXT: v_writelane_b32 v31, s2, 9 +; NOOPT-NEXT: v_writelane_b32 v31, s3, 10 ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 ; NOOPT-NEXT: buffer_store_dword v31, off, s[20:23], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] @@ -4669,18 +4837,21 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v31, 4 -; NOOPT-NEXT: v_readlane_b32 s1, v31, 5 +; NOOPT-NEXT: v_readlane_b32 s0, v31, 7 +; NOOPT-NEXT: v_readlane_b32 s1, v31, 8 ; NOOPT-NEXT: s_mov_b64 exec, s[0:1] ; NOOPT-NEXT: ; %bb.3: ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 ; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v31, 0 -; NOOPT-NEXT: v_readlane_b32 s1, v31, 1 -; NOOPT-NEXT: v_readlane_b32 s2, v31, 2 -; NOOPT-NEXT: v_readlane_b32 s3, v31, 3 +; NOOPT-NEXT: v_readlane_b32 s0, v31, 3 +; NOOPT-NEXT: v_readlane_b32 s1, v31, 4 +; NOOPT-NEXT: v_readlane_b32 s2, v31, 5 +; NOOPT-NEXT: v_readlane_b32 s3, v31, 6 +; NOOPT-NEXT: v_readlane_b32 s4, v31, 0 +; NOOPT-NEXT: v_readlane_b32 s5, v31, 1 +; NOOPT-NEXT: v_readlane_b32 s6, v31, 2 ; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload @@ -4721,7 +4892,14 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; NOOPT-NEXT: v_mov_b32_e32 v16, v11 ; NOOPT-NEXT: v_mov_b32_e32 v17, v10 ; NOOPT-NEXT: v_mov_b32_e32 v18, v9 -; NOOPT-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: s_mov_b32 s8, 0 +; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; NOOPT-NEXT: s_mov_b32 s9, s6 +; NOOPT-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: s_mov_b64 s[6:7], s[8:9] +; NOOPT-NEXT: v_mov_b32_e32 v9, 32 +; NOOPT-NEXT: v_mov_b32_e32 v10, 0 +; NOOPT-NEXT: buffer_store_dwordx4 v[15:18], v[9:10], s[4:7], 0 addr64 offset:16 ; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec ; NOOPT-NEXT: v_mov_b32_e32 v9, v14 ; NOOPT-NEXT: v_mov_b32_e32 v10, v13 @@ -5003,9 +5181,14 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; NOOPT-NEXT: s_add_u32 s36, s36, s11 ; NOOPT-NEXT: s_addc_u32 s37, s37, 0 ; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:76 ; 4-byte Folded Spill -; NOOPT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; NOOPT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; NOOPT-NEXT: s_mov_b32 s8, s3 ; NOOPT-NEXT: s_mov_b32 s4, s2 ; NOOPT-NEXT: s_mov_b32 s2, 0xf000 @@ -5019,18 +5202,19 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; NOOPT-NEXT: v_writelane_b32 v18, s5, 1 ; NOOPT-NEXT: v_writelane_b32 v18, s6, 2 ; NOOPT-NEXT: v_writelane_b32 v18, s7, 3 +; NOOPT-NEXT: v_mov_b32_e32 v2, 0 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v2 +; NOOPT-NEXT: s_mov_b32 s3, 2 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_lshl_b64 v[0:1], v[0:1], s3 ; NOOPT-NEXT: s_mov_b32 s4, 0 ; NOOPT-NEXT: v_writelane_b32 v18, s4, 4 ; NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; NOOPT-NEXT: s_mov_b32 s5, s2 ; NOOPT-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] -; NOOPT-NEXT: s_mov_b32 s4, 2 -; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: v_lshlrev_b32_e64 v0, s4, v0 -; NOOPT-NEXT: v_mov_b32_e32 v2, 0 -; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v1, v2 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:72 ; 4-byte Folded Spill @@ -5768,10 +5952,20 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: s_add_u32 s28, s28, s11 ; NOOPT-NEXT: s_addc_u32 s29, s29, 0 ; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:84 ; 4-byte Folded Spill +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; NOOPT-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_load_dwordx2 s[18:19], s[4:5], 0x9 +; NOOPT-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: v_writelane_b32 v32, s18, 0 +; NOOPT-NEXT: v_writelane_b32 v32, s19, 1 ; NOOPT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xd ; NOOPT-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x19 -; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s24, s19 ; NOOPT-NEXT: s_mov_b32 s20, s18 ; NOOPT-NEXT: s_mov_b32 s18, 0xf000 @@ -5780,23 +5974,25 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: s_mov_b32 s21, s24 ; NOOPT-NEXT: s_mov_b32 s22, s19 ; NOOPT-NEXT: s_mov_b32 s23, s18 -; NOOPT-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane -; NOOPT-NEXT: v_writelane_b32 v32, s20, 0 -; NOOPT-NEXT: v_writelane_b32 v32, s21, 1 -; NOOPT-NEXT: v_writelane_b32 v32, s22, 2 -; NOOPT-NEXT: v_writelane_b32 v32, s23, 3 -; NOOPT-NEXT: s_mov_b32 s20, 0 -; NOOPT-NEXT: v_writelane_b32 v32, s20, 4 -; NOOPT-NEXT: ; kill: def $sgpr20 killed $sgpr20 def $sgpr20_sgpr21 -; NOOPT-NEXT: s_mov_b32 s21, s18 -; NOOPT-NEXT: ; kill: def $sgpr16_sgpr17 killed $sgpr16_sgpr17 def $sgpr16_sgpr17_sgpr18_sgpr19 -; NOOPT-NEXT: s_mov_b64 s[18:19], s[20:21] -; NOOPT-NEXT: s_mov_b32 s20, 2 -; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: v_lshlrev_b32_e64 v0, s20, v0 +; NOOPT-NEXT: v_writelane_b32 v32, s20, 2 +; NOOPT-NEXT: v_writelane_b32 v32, s21, 3 +; NOOPT-NEXT: v_writelane_b32 v32, s22, 4 +; NOOPT-NEXT: v_writelane_b32 v32, s23, 5 ; NOOPT-NEXT: v_mov_b32_e32 v2, 0 ; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; NOOPT-NEXT: v_mov_b32_e32 v1, v2 +; NOOPT-NEXT: s_mov_b32 s19, 2 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_lshl_b64 v[0:1], v[0:1], s19 +; NOOPT-NEXT: s_mov_b32 s20, 0 +; NOOPT-NEXT: v_writelane_b32 v32, s20, 6 +; NOOPT-NEXT: ; kill: def $sgpr20 killed $sgpr20 def $sgpr20_sgpr21 +; NOOPT-NEXT: s_mov_b32 s21, s18 +; NOOPT-NEXT: v_writelane_b32 v32, s20, 7 +; NOOPT-NEXT: v_writelane_b32 v32, s21, 8 +; NOOPT-NEXT: ; kill: def $sgpr16_sgpr17 killed $sgpr16_sgpr17 def $sgpr16_sgpr17_sgpr18_sgpr19 +; NOOPT-NEXT: s_mov_b64 s[18:19], s[20:21] +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64 glc ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:80 ; 4-byte Folded Spill @@ -5828,8 +6024,8 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: v_mov_b32_e32 v14, s14 ; NOOPT-NEXT: v_mov_b32_e32 v15, s15 ; NOOPT-NEXT: s_mov_b64 s[0:1], exec -; NOOPT-NEXT: v_writelane_b32 v32, s0, 5 -; NOOPT-NEXT: v_writelane_b32 v32, s1, 6 +; NOOPT-NEXT: v_writelane_b32 v32, s0, 9 +; NOOPT-NEXT: v_writelane_b32 v32, s1, 10 ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 ; NOOPT-NEXT: buffer_store_dword v32, off, s[28:31], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] @@ -5856,8 +6052,8 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v32, 7 -; NOOPT-NEXT: v_readlane_b32 s1, v32, 8 +; NOOPT-NEXT: v_readlane_b32 s0, v32, 11 +; NOOPT-NEXT: v_readlane_b32 s1, v32, 12 ; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:4 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 offset:8 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 offset:12 ; 4-byte Folded Reload @@ -5922,8 +6118,8 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:60 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] -; NOOPT-NEXT: v_writelane_b32 v32, s2, 7 -; NOOPT-NEXT: v_writelane_b32 v32, s3, 8 +; NOOPT-NEXT: v_writelane_b32 v32, s2, 11 +; NOOPT-NEXT: v_writelane_b32 v32, s3, 12 ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 ; NOOPT-NEXT: buffer_store_dword v32, off, s[28:31], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] @@ -5935,8 +6131,8 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v32, 5 -; NOOPT-NEXT: v_readlane_b32 s1, v32, 6 +; NOOPT-NEXT: v_readlane_b32 s0, v32, 9 +; NOOPT-NEXT: v_readlane_b32 s1, v32, 10 ; NOOPT-NEXT: s_mov_b64 exec, s[0:1] ; NOOPT-NEXT: ; %bb.3: ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 @@ -5962,8 +6158,8 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:216 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[0:1], exec ; NOOPT-NEXT: s_waitcnt vmcnt(14) -; NOOPT-NEXT: v_writelane_b32 v32, s0, 9 -; NOOPT-NEXT: v_writelane_b32 v32, s1, 10 +; NOOPT-NEXT: v_writelane_b32 v32, s0, 13 +; NOOPT-NEXT: v_writelane_b32 v32, s1, 14 ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 ; NOOPT-NEXT: buffer_store_dword v32, off, s[28:31], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] @@ -5994,8 +6190,8 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v32, 11 -; NOOPT-NEXT: v_readlane_b32 s1, v32, 12 +; NOOPT-NEXT: v_readlane_b32 s0, v32, 15 +; NOOPT-NEXT: v_readlane_b32 s1, v32, 16 ; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:152 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 offset:156 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 offset:160 ; 4-byte Folded Reload @@ -6060,8 +6256,8 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:208 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:212 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] -; NOOPT-NEXT: v_writelane_b32 v32, s2, 11 -; NOOPT-NEXT: v_writelane_b32 v32, s3, 12 +; NOOPT-NEXT: v_writelane_b32 v32, s2, 15 +; NOOPT-NEXT: v_writelane_b32 v32, s3, 16 ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 ; NOOPT-NEXT: buffer_store_dword v32, off, s[28:31], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] @@ -6073,19 +6269,23 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v32, 9 -; NOOPT-NEXT: v_readlane_b32 s1, v32, 10 +; NOOPT-NEXT: v_readlane_b32 s0, v32, 13 +; NOOPT-NEXT: v_readlane_b32 s1, v32, 14 ; NOOPT-NEXT: s_mov_b64 exec, s[0:1] ; NOOPT-NEXT: ; %bb.6: ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 ; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v32, 4 -; NOOPT-NEXT: v_readlane_b32 s4, v32, 0 -; NOOPT-NEXT: v_readlane_b32 s5, v32, 1 -; NOOPT-NEXT: v_readlane_b32 s6, v32, 2 -; NOOPT-NEXT: v_readlane_b32 s7, v32, 3 +; NOOPT-NEXT: v_readlane_b32 s0, v32, 6 +; NOOPT-NEXT: v_readlane_b32 s4, v32, 2 +; NOOPT-NEXT: v_readlane_b32 s5, v32, 3 +; NOOPT-NEXT: v_readlane_b32 s6, v32, 4 +; NOOPT-NEXT: v_readlane_b32 s7, v32, 5 +; NOOPT-NEXT: v_readlane_b32 s2, v32, 7 +; NOOPT-NEXT: v_readlane_b32 s3, v32, 8 +; NOOPT-NEXT: v_readlane_b32 s8, v32, 0 +; NOOPT-NEXT: v_readlane_b32 s9, v32, 1 ; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:84 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:220 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:224 ; 4-byte Folded Reload @@ -6127,7 +6327,11 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: v_mov_b32_e32 v17, v12 ; NOOPT-NEXT: v_mov_b32_e32 v18, v11 ; NOOPT-NEXT: v_mov_b32_e32 v19, v10 -; NOOPT-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:48 +; NOOPT-NEXT: ; kill: def $sgpr8_sgpr9 killed $sgpr8_sgpr9 def $sgpr8_sgpr9_sgpr10_sgpr11 +; NOOPT-NEXT: s_mov_b64 s[10:11], s[2:3] +; NOOPT-NEXT: v_mov_b32_e32 v10, 32 +; NOOPT-NEXT: v_mov_b32_e32 v11, 0 +; NOOPT-NEXT: buffer_store_dwordx4 v[16:19], v[10:11], s[8:11], 0 addr64 offset:16 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10_vgpr11_vgpr12 killed $exec ; NOOPT-NEXT: v_mov_b32_e32 v10, v15 @@ -6149,8 +6353,8 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0 ; NOOPT-NEXT: s_mov_b64 s[0:1], exec -; NOOPT-NEXT: v_writelane_b32 v32, s0, 13 -; NOOPT-NEXT: v_writelane_b32 v32, s1, 14 +; NOOPT-NEXT: v_writelane_b32 v32, s0, 17 +; NOOPT-NEXT: v_writelane_b32 v32, s1, 18 ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 ; NOOPT-NEXT: buffer_store_dword v32, off, s[28:31], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] @@ -6178,8 +6382,8 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v32, 13 -; NOOPT-NEXT: v_readlane_b32 s1, v32, 14 +; NOOPT-NEXT: v_readlane_b32 s0, v32, 17 +; NOOPT-NEXT: v_readlane_b32 s1, v32, 18 ; NOOPT-NEXT: s_or_b64 exec, exec, s[0:1] ; NOOPT-NEXT: s_endpgm ; @@ -6698,138 +6902,159 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; NOOPT-LABEL: insert_w_offset_multiple_in_block: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; NOOPT-NEXT: s_load_dword s4, s[4:5], 0xb ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) -; NOOPT-NEXT: s_mov_b32 s7, s1 -; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 -; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_load_dword s0, s[4:5], 0xb +; NOOPT-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x9 +; NOOPT-NEXT: s_load_dword s5, s[4:5], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s15 +; NOOPT-NEXT: s_mov_b32 s0, s14 +; NOOPT-NEXT: s_mov_b32 s4, 0xf000 ; NOOPT-NEXT: s_mov_b32 s6, -1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 ; NOOPT-NEXT: s_mov_b32 s1, s7 ; NOOPT-NEXT: s_mov_b32 s2, s6 -; NOOPT-NEXT: s_mov_b32 s3, s5 -; NOOPT-NEXT: s_mov_b32 s5, 1 -; NOOPT-NEXT: s_add_i32 s5, s4, s5 -; NOOPT-NEXT: s_mov_b32 s6, 0x41800000 -; NOOPT-NEXT: s_mov_b32 s7, 0x41700000 -; NOOPT-NEXT: s_mov_b32 s8, 0x41600000 -; NOOPT-NEXT: s_mov_b32 s9, 0x41500000 -; NOOPT-NEXT: s_mov_b32 s10, 0x41400000 -; NOOPT-NEXT: s_mov_b32 s11, 0x41300000 -; NOOPT-NEXT: s_mov_b32 s12, 0x41200000 -; NOOPT-NEXT: s_mov_b32 s13, 0x41100000 -; NOOPT-NEXT: s_mov_b32 s14, 0x41000000 -; NOOPT-NEXT: s_mov_b32 s15, 0x40e00000 -; NOOPT-NEXT: s_mov_b32 s16, 0x40c00000 -; NOOPT-NEXT: s_mov_b32 s17, 0x40a00000 -; NOOPT-NEXT: s_mov_b32 s18, 4.0 -; NOOPT-NEXT: s_mov_b32 s19, 0x40400000 -; NOOPT-NEXT: s_mov_b32 s20, 2.0 -; NOOPT-NEXT: s_mov_b32 s21, 1.0 -; NOOPT-NEXT: v_mov_b32_e32 v23, s21 -; NOOPT-NEXT: v_mov_b32_e32 v14, s20 -; NOOPT-NEXT: v_mov_b32_e32 v13, s19 -; NOOPT-NEXT: v_mov_b32_e32 v12, s18 -; NOOPT-NEXT: v_mov_b32_e32 v11, s17 -; NOOPT-NEXT: v_mov_b32_e32 v10, s16 -; NOOPT-NEXT: v_mov_b32_e32 v9, s15 -; NOOPT-NEXT: v_mov_b32_e32 v8, s14 -; NOOPT-NEXT: v_mov_b32_e32 v7, s13 -; NOOPT-NEXT: v_mov_b32_e32 v6, s12 -; NOOPT-NEXT: v_mov_b32_e32 v5, s11 -; NOOPT-NEXT: v_mov_b32_e32 v4, s10 -; NOOPT-NEXT: v_mov_b32_e32 v3, s9 -; NOOPT-NEXT: v_mov_b32_e32 v2, s8 -; NOOPT-NEXT: v_mov_b32_e32 v1, s7 -; NOOPT-NEXT: v_mov_b32_e32 v0, s6 -; NOOPT-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v24, v14 -; NOOPT-NEXT: v_mov_b32_e32 v25, v13 -; NOOPT-NEXT: v_mov_b32_e32 v26, v12 -; NOOPT-NEXT: v_mov_b32_e32 v27, v11 -; NOOPT-NEXT: v_mov_b32_e32 v28, v10 -; NOOPT-NEXT: v_mov_b32_e32 v29, v9 -; NOOPT-NEXT: v_mov_b32_e32 v30, v8 -; NOOPT-NEXT: v_mov_b32_e32 v31, v7 -; NOOPT-NEXT: v_mov_b32_e32 v32, v6 -; NOOPT-NEXT: v_mov_b32_e32 v33, v5 -; NOOPT-NEXT: v_mov_b32_e32 v34, v4 -; NOOPT-NEXT: v_mov_b32_e32 v35, v3 -; NOOPT-NEXT: v_mov_b32_e32 v36, v2 -; NOOPT-NEXT: v_mov_b32_e32 v37, v1 -; NOOPT-NEXT: v_mov_b32_e32 v38, v0 +; NOOPT-NEXT: s_mov_b32 s3, s4 +; NOOPT-NEXT: s_mov_b32 s6, 1 +; NOOPT-NEXT: s_add_i32 s6, s5, s6 +; NOOPT-NEXT: s_mov_b32 s7, 0x41800000 +; NOOPT-NEXT: s_mov_b32 s8, 0x41700000 +; NOOPT-NEXT: s_mov_b32 s9, 0x41600000 +; NOOPT-NEXT: s_mov_b32 s10, 0x41500000 +; NOOPT-NEXT: s_mov_b32 s11, 0x41400000 +; NOOPT-NEXT: s_mov_b32 s12, 0x41300000 +; NOOPT-NEXT: s_mov_b32 s13, 0x41200000 +; NOOPT-NEXT: s_mov_b32 s16, 0x41100000 +; NOOPT-NEXT: s_mov_b32 s17, 0x41000000 +; NOOPT-NEXT: s_mov_b32 s18, 0x40e00000 +; NOOPT-NEXT: s_mov_b32 s19, 0x40c00000 +; NOOPT-NEXT: s_mov_b32 s20, 0x40a00000 +; NOOPT-NEXT: s_mov_b32 s21, 4.0 +; NOOPT-NEXT: s_mov_b32 s22, 0x40400000 +; NOOPT-NEXT: s_mov_b32 s23, 2.0 +; NOOPT-NEXT: s_mov_b32 s24, 1.0 +; NOOPT-NEXT: v_mov_b32_e32 v25, s24 +; NOOPT-NEXT: v_mov_b32_e32 v14, s23 +; NOOPT-NEXT: v_mov_b32_e32 v13, s22 +; NOOPT-NEXT: v_mov_b32_e32 v12, s21 +; NOOPT-NEXT: v_mov_b32_e32 v11, s20 +; NOOPT-NEXT: v_mov_b32_e32 v10, s19 +; NOOPT-NEXT: v_mov_b32_e32 v9, s18 +; NOOPT-NEXT: v_mov_b32_e32 v8, s17 +; NOOPT-NEXT: v_mov_b32_e32 v7, s16 +; NOOPT-NEXT: v_mov_b32_e32 v6, s13 +; NOOPT-NEXT: v_mov_b32_e32 v5, s12 +; NOOPT-NEXT: v_mov_b32_e32 v4, s11 +; NOOPT-NEXT: v_mov_b32_e32 v3, s10 +; NOOPT-NEXT: v_mov_b32_e32 v2, s9 +; NOOPT-NEXT: v_mov_b32_e32 v1, s8 +; NOOPT-NEXT: v_mov_b32_e32 v0, s7 +; NOOPT-NEXT: ; kill: def $vgpr25 killed $vgpr25 def $vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v26, v14 +; NOOPT-NEXT: v_mov_b32_e32 v27, v13 +; NOOPT-NEXT: v_mov_b32_e32 v28, v12 +; NOOPT-NEXT: v_mov_b32_e32 v29, v11 +; NOOPT-NEXT: v_mov_b32_e32 v30, v10 +; NOOPT-NEXT: v_mov_b32_e32 v31, v9 +; NOOPT-NEXT: v_mov_b32_e32 v32, v8 +; NOOPT-NEXT: v_mov_b32_e32 v33, v7 +; NOOPT-NEXT: v_mov_b32_e32 v34, v6 +; NOOPT-NEXT: v_mov_b32_e32 v35, v5 +; NOOPT-NEXT: v_mov_b32_e32 v36, v4 +; NOOPT-NEXT: v_mov_b32_e32 v37, v3 +; NOOPT-NEXT: v_mov_b32_e32 v38, v2 +; NOOPT-NEXT: v_mov_b32_e32 v39, v1 +; NOOPT-NEXT: v_mov_b32_e32 v40, v0 ; NOOPT-NEXT: v_mov_b32_e32 v0, 0x41880000 +; NOOPT-NEXT: s_mov_b32 m0, s6 +; NOOPT-NEXT: v_movreld_b32_e32 v25, v0 +; NOOPT-NEXT: s_mov_b32 s6, 2 +; NOOPT-NEXT: s_add_i32 s5, s5, s6 ; NOOPT-NEXT: s_mov_b32 m0, s5 -; NOOPT-NEXT: v_movreld_b32_e32 v23, v0 -; NOOPT-NEXT: s_mov_b32 s5, 2 -; NOOPT-NEXT: s_add_i32 s4, s4, s5 -; NOOPT-NEXT: s_mov_b32 m0, s4 -; NOOPT-NEXT: v_mov_b32_e32 v7, v23 -; NOOPT-NEXT: v_mov_b32_e32 v8, v24 -; NOOPT-NEXT: v_mov_b32_e32 v9, v25 -; NOOPT-NEXT: v_mov_b32_e32 v10, v26 -; NOOPT-NEXT: v_mov_b32_e32 v11, v27 -; NOOPT-NEXT: v_mov_b32_e32 v12, v28 -; NOOPT-NEXT: v_mov_b32_e32 v13, v29 -; NOOPT-NEXT: v_mov_b32_e32 v14, v30 -; NOOPT-NEXT: v_mov_b32_e32 v15, v31 -; NOOPT-NEXT: v_mov_b32_e32 v16, v32 -; NOOPT-NEXT: v_mov_b32_e32 v17, v33 -; NOOPT-NEXT: v_mov_b32_e32 v18, v34 -; NOOPT-NEXT: v_mov_b32_e32 v19, v35 -; NOOPT-NEXT: v_mov_b32_e32 v20, v36 -; NOOPT-NEXT: v_mov_b32_e32 v21, v37 -; NOOPT-NEXT: v_mov_b32_e32 v22, v38 +; NOOPT-NEXT: v_mov_b32_e32 v7, v25 +; NOOPT-NEXT: v_mov_b32_e32 v8, v26 +; NOOPT-NEXT: v_mov_b32_e32 v9, v27 +; NOOPT-NEXT: v_mov_b32_e32 v10, v28 +; NOOPT-NEXT: v_mov_b32_e32 v11, v29 +; NOOPT-NEXT: v_mov_b32_e32 v12, v30 +; NOOPT-NEXT: v_mov_b32_e32 v13, v31 +; NOOPT-NEXT: v_mov_b32_e32 v14, v32 +; NOOPT-NEXT: v_mov_b32_e32 v15, v33 +; NOOPT-NEXT: v_mov_b32_e32 v16, v34 +; NOOPT-NEXT: v_mov_b32_e32 v17, v35 +; NOOPT-NEXT: v_mov_b32_e32 v18, v36 +; NOOPT-NEXT: v_mov_b32_e32 v19, v37 +; NOOPT-NEXT: v_mov_b32_e32 v20, v38 +; NOOPT-NEXT: v_mov_b32_e32 v21, v39 +; NOOPT-NEXT: v_mov_b32_e32 v22, v40 ; NOOPT-NEXT: v_movreld_b32_e32 v7, v0 -; NOOPT-NEXT: v_mov_b32_e32 v4, v38 -; NOOPT-NEXT: v_mov_b32_e32 v5, v37 +; NOOPT-NEXT: v_mov_b32_e32 v4, v40 +; NOOPT-NEXT: v_mov_b32_e32 v5, v39 +; NOOPT-NEXT: v_mov_b32_e32 v6, v38 +; NOOPT-NEXT: v_mov_b32_e32 v0, v37 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: s_mov_b32 s10, 0 +; NOOPT-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11 +; NOOPT-NEXT: s_mov_b32 s11, s4 +; NOOPT-NEXT: s_mov_b64 s[4:5], s[14:15] +; NOOPT-NEXT: s_mov_b64 s[6:7], s[10:11] +; NOOPT-NEXT: v_mov_b32_e32 v4, 32 +; NOOPT-NEXT: v_mov_b32_e32 v5, 0 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:16 ; NOOPT-NEXT: v_mov_b32_e32 v6, v36 -; NOOPT-NEXT: v_mov_b32_e32 v0, v35 -; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v1, v6 -; NOOPT-NEXT: v_mov_b32_e32 v2, v5 -; NOOPT-NEXT: v_mov_b32_e32 v3, v4 -; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; NOOPT-NEXT: v_mov_b32_e32 v4, v34 -; NOOPT-NEXT: v_mov_b32_e32 v5, v33 -; NOOPT-NEXT: v_mov_b32_e32 v6, v32 +; NOOPT-NEXT: v_mov_b32_e32 v23, v35 +; NOOPT-NEXT: v_mov_b32_e32 v24, v34 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: v_mov_b32_e32 v0, v31 +; NOOPT-NEXT: v_mov_b32_e32 v0, v33 ; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v1, v6 -; NOOPT-NEXT: v_mov_b32_e32 v2, v5 -; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: v_mov_b32_e32 v1, v24 +; NOOPT-NEXT: v_mov_b32_e32 v2, v23 +; NOOPT-NEXT: v_mov_b32_e32 v3, v6 ; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; NOOPT-NEXT: v_mov_b32_e32 v4, v30 -; NOOPT-NEXT: v_mov_b32_e32 v5, v29 -; NOOPT-NEXT: v_mov_b32_e32 v6, v28 +; NOOPT-NEXT: v_mov_b32_e32 v6, v32 +; NOOPT-NEXT: v_mov_b32_e32 v23, v31 +; NOOPT-NEXT: v_mov_b32_e32 v24, v30 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: v_mov_b32_e32 v0, v27 +; NOOPT-NEXT: v_mov_b32_e32 v0, v29 ; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v1, v6 -; NOOPT-NEXT: v_mov_b32_e32 v2, v5 -; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: v_mov_b32_e32 v1, v24 +; NOOPT-NEXT: v_mov_b32_e32 v2, v23 +; NOOPT-NEXT: v_mov_b32_e32 v3, v6 ; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; NOOPT-NEXT: v_mov_b32_e32 v4, v26 -; NOOPT-NEXT: v_mov_b32_e32 v5, v25 -; NOOPT-NEXT: v_mov_b32_e32 v6, v24 +; NOOPT-NEXT: v_mov_b32_e32 v6, v28 +; NOOPT-NEXT: v_mov_b32_e32 v23, v27 +; NOOPT-NEXT: v_mov_b32_e32 v24, v26 ; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: v_mov_b32_e32 v0, v23 +; NOOPT-NEXT: v_mov_b32_e32 v0, v25 ; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v1, v6 -; NOOPT-NEXT: v_mov_b32_e32 v2, v5 -; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: v_mov_b32_e32 v1, v24 +; NOOPT-NEXT: v_mov_b32_e32 v2, v23 +; NOOPT-NEXT: v_mov_b32_e32 v3, v6 ; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; NOOPT-NEXT: v_mov_b32_e32 v4, v22 -; NOOPT-NEXT: v_mov_b32_e32 v5, v21 -; NOOPT-NEXT: v_mov_b32_e32 v6, v20 +; NOOPT-NEXT: s_mov_b64 s[8:9], 64 +; NOOPT-NEXT: s_mov_b32 s12, s14 +; NOOPT-NEXT: s_mov_b32 s13, s15 +; NOOPT-NEXT: s_mov_b32 s15, s8 +; NOOPT-NEXT: s_mov_b32 s14, s9 +; NOOPT-NEXT: s_add_u32 s12, s12, s15 +; NOOPT-NEXT: s_addc_u32 s14, s13, s14 +; NOOPT-NEXT: ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13 +; NOOPT-NEXT: s_mov_b32 s13, s14 +; NOOPT-NEXT: v_mov_b32_e32 v6, v22 +; NOOPT-NEXT: v_mov_b32_e32 v23, v21 +; NOOPT-NEXT: v_mov_b32_e32 v24, v20 ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: v_mov_b32_e32 v0, v19 ; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v1, v6 -; NOOPT-NEXT: v_mov_b32_e32 v2, v5 -; NOOPT-NEXT: v_mov_b32_e32 v3, v4 -; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; NOOPT-NEXT: v_mov_b32_e32 v1, v24 +; NOOPT-NEXT: v_mov_b32_e32 v2, v23 +; NOOPT-NEXT: v_mov_b32_e32 v3, v6 +; NOOPT-NEXT: ; kill: def $sgpr12_sgpr13 killed $sgpr12_sgpr13 def $sgpr12_sgpr13_sgpr14_sgpr15 +; NOOPT-NEXT: s_mov_b64 s[14:15], s[10:11] +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[12:15], 0 addr64 offset:16 ; NOOPT-NEXT: v_mov_b32_e32 v4, v18 ; NOOPT-NEXT: v_mov_b32_e32 v5, v17 ; NOOPT-NEXT: v_mov_b32_e32 v6, v16 @@ -6839,7 +7064,9 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; NOOPT-NEXT: v_mov_b32_e32 v1, v6 ; NOOPT-NEXT: v_mov_b32_e32 v2, v5 ; NOOPT-NEXT: v_mov_b32_e32 v3, v4 -; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; NOOPT-NEXT: v_mov_b32_e32 v4, s8 +; NOOPT-NEXT: v_mov_b32_e32 v5, s9 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 ; NOOPT-NEXT: v_mov_b32_e32 v4, v14 ; NOOPT-NEXT: v_mov_b32_e32 v5, v13 ; NOOPT-NEXT: v_mov_b32_e32 v6, v12 @@ -6849,7 +7076,9 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; NOOPT-NEXT: v_mov_b32_e32 v1, v6 ; NOOPT-NEXT: v_mov_b32_e32 v2, v5 ; NOOPT-NEXT: v_mov_b32_e32 v3, v4 -; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; NOOPT-NEXT: v_mov_b32_e32 v4, s8 +; NOOPT-NEXT: v_mov_b32_e32 v5, s9 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:16 ; NOOPT-NEXT: v_mov_b32_e32 v4, v10 ; NOOPT-NEXT: v_mov_b32_e32 v5, v9 ; NOOPT-NEXT: v_mov_b32_e32 v6, v8 @@ -7201,11 +7430,12 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) { ; NOOPT-NEXT: s_mov_b32 s15, 0xe8f000 ; NOOPT-NEXT: s_add_u32 s12, s12, s11 ; NOOPT-NEXT: s_addc_u32 s13, s13, 0 +; NOOPT-NEXT: s_load_dword s0, s[4:5], 0x9 ; NOOPT-NEXT: s_load_dword s2, s[4:5], 0x9 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b64 s[0:1], -1 ; NOOPT-NEXT: ; implicit-def: $sgpr3 ; NOOPT-NEXT: s_mov_b32 s3, 0 -; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_cmp_lg_u32 s2, s3 ; NOOPT-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane ; NOOPT-NEXT: v_writelane_b32 v4, s0, 0 @@ -7224,9 +7454,8 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) { ; NOOPT-NEXT: v_readlane_b32 s0, v4, 0 ; NOOPT-NEXT: v_readlane_b32 s1, v4, 1 ; NOOPT-NEXT: ; implicit-def: $sgpr2 -; NOOPT-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; NOOPT-NEXT: s_mov_b32 s0, 1 -; NOOPT-NEXT: v_cmp_ne_u32_e64 s[0:1], v0, s0 +; NOOPT-NEXT: s_mov_b64 s[2:3], -1 +; NOOPT-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; NOOPT-NEXT: s_and_b64 vcc, exec, s[0:1] ; NOOPT-NEXT: s_cbranch_vccnz .LBB19_4 ; NOOPT-NEXT: ; %bb.2: ; %bb1 @@ -7440,7 +7669,11 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) { ; NOOPT-NEXT: s_mov_b32 s19, 0xe8f000 ; NOOPT-NEXT: s_add_u32 s16, s16, s11 ; NOOPT-NEXT: s_addc_u32 s17, s17, 0 +; NOOPT-NEXT: s_load_dword s0, s[4:5], 0x9 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dword s0, s[4:5], 0xa ; NOOPT-NEXT: s_load_dword s2, s[4:5], 0x9 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_load_dword s0, s[4:5], 0xa ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b64 s[0:1], -1 @@ -7464,9 +7697,8 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) { ; NOOPT-NEXT: v_readlane_b32 s0, v4, 0 ; NOOPT-NEXT: v_readlane_b32 s1, v4, 1 ; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 -; NOOPT-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; NOOPT-NEXT: s_mov_b32 s0, 1 -; NOOPT-NEXT: v_cmp_ne_u32_e64 s[0:1], v0, s0 +; NOOPT-NEXT: s_mov_b64 s[2:3], -1 +; NOOPT-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; NOOPT-NEXT: s_and_b64 vcc, exec, s[0:1] ; NOOPT-NEXT: s_cbranch_vccnz .LBB20_4 ; NOOPT-NEXT: ; %bb.2: ; %bb1 @@ -7665,23 +7897,17 @@ define amdgpu_kernel void @multi_same_block(i32 %arg) { ; NOOPT-LABEL: multi_same_block: ; NOOPT: ; %bb.0: ; %bb ; NOOPT-NEXT: s_load_dword s0, s[4:5], 0x9 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dword s0, s[4:5], 0x9 +; NOOPT-NEXT: s_mov_b32 s1, 0x41c80000 +; NOOPT-NEXT: s_mov_b32 s2, 0x41c00000 +; NOOPT-NEXT: s_mov_b32 s3, 0x41b80000 +; NOOPT-NEXT: s_mov_b32 s4, 0x41b00000 +; NOOPT-NEXT: s_mov_b32 s5, 0x41a80000 +; NOOPT-NEXT: s_mov_b32 s6, 0x41a00000 +; NOOPT-NEXT: s_mov_b32 s7, 0x41980000 ; NOOPT-NEXT: s_mov_b32 s8, 0x41900000 -; NOOPT-NEXT: ; implicit-def: $sgpr9 -; NOOPT-NEXT: ; implicit-def: $sgpr1 -; NOOPT-NEXT: ; implicit-def: $sgpr7 -; NOOPT-NEXT: ; implicit-def: $sgpr1 -; NOOPT-NEXT: ; implicit-def: $sgpr6 -; NOOPT-NEXT: ; implicit-def: $sgpr1 -; NOOPT-NEXT: ; implicit-def: $sgpr5 -; NOOPT-NEXT: ; implicit-def: $sgpr1 -; NOOPT-NEXT: ; implicit-def: $sgpr4 -; NOOPT-NEXT: ; implicit-def: $sgpr1 -; NOOPT-NEXT: ; implicit-def: $sgpr3 -; NOOPT-NEXT: ; implicit-def: $sgpr1 -; NOOPT-NEXT: ; implicit-def: $sgpr2 -; NOOPT-NEXT: ; implicit-def: $sgpr1 -; NOOPT-NEXT: ; implicit-def: $sgpr1 -; NOOPT-NEXT: ; implicit-def: $sgpr10 +; NOOPT-NEXT: s_mov_b32 s9, 0x41880000 ; NOOPT-NEXT: v_mov_b32_e32 v12, s9 ; NOOPT-NEXT: v_mov_b32_e32 v7, s8 ; NOOPT-NEXT: v_mov_b32_e32 v6, s7 @@ -7704,23 +7930,15 @@ define amdgpu_kernel void @multi_same_block(i32 %arg) { ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_add_i32 m0, s0, -16 ; NOOPT-NEXT: v_movreld_b32_e32 v12, v0 +; NOOPT-NEXT: s_mov_b32 s1, 0x41c8cccd +; NOOPT-NEXT: s_mov_b32 s2, 0x41c0cccd +; NOOPT-NEXT: s_mov_b32 s3, 0x41b8cccd ; NOOPT-NEXT: s_mov_b32 s4, 0x41b0cccd -; NOOPT-NEXT: ; implicit-def: $sgpr9 -; NOOPT-NEXT: ; implicit-def: $sgpr1 -; NOOPT-NEXT: ; implicit-def: $sgpr8 -; NOOPT-NEXT: ; implicit-def: $sgpr1 -; NOOPT-NEXT: ; implicit-def: $sgpr7 -; NOOPT-NEXT: ; implicit-def: $sgpr1 -; NOOPT-NEXT: ; implicit-def: $sgpr6 -; NOOPT-NEXT: ; implicit-def: $sgpr1 -; NOOPT-NEXT: ; implicit-def: $sgpr5 -; NOOPT-NEXT: ; implicit-def: $sgpr1 -; NOOPT-NEXT: ; implicit-def: $sgpr3 -; NOOPT-NEXT: ; implicit-def: $sgpr1 -; NOOPT-NEXT: ; implicit-def: $sgpr2 -; NOOPT-NEXT: ; implicit-def: $sgpr1 -; NOOPT-NEXT: ; implicit-def: $sgpr1 -; NOOPT-NEXT: ; implicit-def: $sgpr10 +; NOOPT-NEXT: s_mov_b32 s5, 0x41a8cccd +; NOOPT-NEXT: s_mov_b32 s6, 0x41a0cccd +; NOOPT-NEXT: s_mov_b32 s7, 0x4198cccd +; NOOPT-NEXT: s_mov_b32 s8, 0x4190cccd +; NOOPT-NEXT: s_mov_b32 s9, 0x4188cccd ; NOOPT-NEXT: v_mov_b32_e32 v3, s9 ; NOOPT-NEXT: v_mov_b32_e32 v25, s8 ; NOOPT-NEXT: v_mov_b32_e32 v24, s7 @@ -7901,6 +8119,12 @@ define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out ; NOOPT-LABEL: extract_largest_inbounds_offset: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dword s0, s[4:5], 0xd +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; NOOPT-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xb ; NOOPT-NEXT: s_load_dword s4, s[4:5], 0xd ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -7913,35 +8137,74 @@ define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out ; NOOPT-NEXT: s_mov_b32 s2, s6 ; NOOPT-NEXT: s_mov_b32 s3, s5 ; NOOPT-NEXT: s_mov_b32 s7, s9 -; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 -; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 -; NOOPT-NEXT: s_mov_b32 s9, s7 -; NOOPT-NEXT: s_mov_b32 s10, s6 -; NOOPT-NEXT: s_mov_b32 s11, s5 -; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc +; NOOPT-NEXT: s_mov_b32 s12, s8 +; NOOPT-NEXT: ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13_sgpr14_sgpr15 +; NOOPT-NEXT: s_mov_b32 s13, s7 +; NOOPT-NEXT: s_mov_b32 s14, s6 +; NOOPT-NEXT: s_mov_b32 s15, s5 +; NOOPT-NEXT: buffer_load_dwordx4 v[5:8], off, s[12:15], 0 glc ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 glc +; NOOPT-NEXT: buffer_load_dwordx4 v[9:12], off, s[12:15], 0 offset:16 glc ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 glc +; NOOPT-NEXT: buffer_load_dwordx4 v[1:4], off, s[12:15], 0 offset:32 glc ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc +; NOOPT-NEXT: v_mov_b32_e32 v0, v12 +; NOOPT-NEXT: v_mov_b32_e32 v13, v11 +; NOOPT-NEXT: v_mov_b32_e32 v14, v10 +; NOOPT-NEXT: v_mov_b32_e32 v15, v9 +; NOOPT-NEXT: v_mov_b32_e32 v16, v8 +; NOOPT-NEXT: v_mov_b32_e32 v17, v7 +; NOOPT-NEXT: v_mov_b32_e32 v18, v6 +; NOOPT-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec +; NOOPT-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v6, v18 +; NOOPT-NEXT: v_mov_b32_e32 v7, v17 +; NOOPT-NEXT: v_mov_b32_e32 v8, v16 +; NOOPT-NEXT: v_mov_b32_e32 v9, v15 +; NOOPT-NEXT: v_mov_b32_e32 v10, v14 +; NOOPT-NEXT: v_mov_b32_e32 v11, v13 +; NOOPT-NEXT: v_mov_b32_e32 v12, v0 +; NOOPT-NEXT: v_mov_b32_e32 v24, v12 +; NOOPT-NEXT: v_mov_b32_e32 v25, v11 +; NOOPT-NEXT: v_mov_b32_e32 v26, v10 +; NOOPT-NEXT: v_mov_b32_e32 v27, v9 +; NOOPT-NEXT: v_mov_b32_e32 v28, v8 +; NOOPT-NEXT: v_mov_b32_e32 v29, v7 +; NOOPT-NEXT: v_mov_b32_e32 v30, v6 +; NOOPT-NEXT: v_mov_b32_e32 v0, v5 +; NOOPT-NEXT: v_mov_b32_e32 v13, v4 +; NOOPT-NEXT: v_mov_b32_e32 v14, v3 +; NOOPT-NEXT: v_mov_b32_e32 v15, v2 +; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; NOOPT-NEXT: s_mov_b32 s6, 0 +; NOOPT-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; NOOPT-NEXT: s_mov_b32 s7, s5 +; NOOPT-NEXT: ; kill: def $sgpr8_sgpr9 killed $sgpr8_sgpr9 def $sgpr8_sgpr9_sgpr10_sgpr11 +; NOOPT-NEXT: s_mov_b64 s[10:11], s[6:7] +; NOOPT-NEXT: v_mov_b32_e32 v2, 32 +; NOOPT-NEXT: v_mov_b32_e32 v3, 0 +; NOOPT-NEXT: buffer_load_dwordx4 v[2:5], v[2:3], s[8:11], 0 addr64 offset:16 glc ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_mov_b32_e32 v16, v15 -; NOOPT-NEXT: v_mov_b32_e32 v17, v14 -; NOOPT-NEXT: v_mov_b32_e32 v18, v13 -; NOOPT-NEXT: v_mov_b32_e32 v19, v12 -; NOOPT-NEXT: v_mov_b32_e32 v20, v11 -; NOOPT-NEXT: v_mov_b32_e32 v21, v10 -; NOOPT-NEXT: v_mov_b32_e32 v22, v9 -; NOOPT-NEXT: v_mov_b32_e32 v23, v8 -; NOOPT-NEXT: v_mov_b32_e32 v24, v7 -; NOOPT-NEXT: v_mov_b32_e32 v25, v6 -; NOOPT-NEXT: v_mov_b32_e32 v26, v5 -; NOOPT-NEXT: v_mov_b32_e32 v27, v4 -; NOOPT-NEXT: v_mov_b32_e32 v28, v3 -; NOOPT-NEXT: v_mov_b32_e32 v29, v2 -; NOOPT-NEXT: v_mov_b32_e32 v30, v1 -; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v9, v5 +; NOOPT-NEXT: v_mov_b32_e32 v10, v4 +; NOOPT-NEXT: v_mov_b32_e32 v11, v3 +; NOOPT-NEXT: v_mov_b32_e32 v12, v2 +; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v2, v15 +; NOOPT-NEXT: v_mov_b32_e32 v3, v14 +; NOOPT-NEXT: v_mov_b32_e32 v4, v13 +; NOOPT-NEXT: v_mov_b32_e32 v5, v12 +; NOOPT-NEXT: v_mov_b32_e32 v6, v11 +; NOOPT-NEXT: v_mov_b32_e32 v7, v10 +; NOOPT-NEXT: v_mov_b32_e32 v8, v9 +; NOOPT-NEXT: v_mov_b32_e32 v16, v8 +; NOOPT-NEXT: v_mov_b32_e32 v17, v7 +; NOOPT-NEXT: v_mov_b32_e32 v18, v6 +; NOOPT-NEXT: v_mov_b32_e32 v19, v5 +; NOOPT-NEXT: v_mov_b32_e32 v20, v4 +; NOOPT-NEXT: v_mov_b32_e32 v21, v3 +; NOOPT-NEXT: v_mov_b32_e32 v22, v2 +; NOOPT-NEXT: v_mov_b32_e32 v23, v1 ; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec ; NOOPT-NEXT: v_mov_b32_e32 v1, v30 ; NOOPT-NEXT: v_mov_b32_e32 v2, v29 @@ -8163,6 +8426,12 @@ define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, p ; NOOPT-LABEL: extract_out_of_bounds_offset: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dword s0, s[4:5], 0xd +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; NOOPT-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xb ; NOOPT-NEXT: s_load_dword s4, s[4:5], 0xd ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -8175,35 +8444,74 @@ define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, p ; NOOPT-NEXT: s_mov_b32 s2, s6 ; NOOPT-NEXT: s_mov_b32 s3, s5 ; NOOPT-NEXT: s_mov_b32 s7, s9 -; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 -; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 -; NOOPT-NEXT: s_mov_b32 s9, s7 -; NOOPT-NEXT: s_mov_b32 s10, s6 -; NOOPT-NEXT: s_mov_b32 s11, s5 -; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc +; NOOPT-NEXT: s_mov_b32 s12, s8 +; NOOPT-NEXT: ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13_sgpr14_sgpr15 +; NOOPT-NEXT: s_mov_b32 s13, s7 +; NOOPT-NEXT: s_mov_b32 s14, s6 +; NOOPT-NEXT: s_mov_b32 s15, s5 +; NOOPT-NEXT: buffer_load_dwordx4 v[5:8], off, s[12:15], 0 glc ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 glc +; NOOPT-NEXT: buffer_load_dwordx4 v[9:12], off, s[12:15], 0 offset:16 glc ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 glc +; NOOPT-NEXT: buffer_load_dwordx4 v[1:4], off, s[12:15], 0 offset:32 glc ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc +; NOOPT-NEXT: v_mov_b32_e32 v0, v12 +; NOOPT-NEXT: v_mov_b32_e32 v13, v11 +; NOOPT-NEXT: v_mov_b32_e32 v14, v10 +; NOOPT-NEXT: v_mov_b32_e32 v15, v9 +; NOOPT-NEXT: v_mov_b32_e32 v16, v8 +; NOOPT-NEXT: v_mov_b32_e32 v17, v7 +; NOOPT-NEXT: v_mov_b32_e32 v18, v6 +; NOOPT-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec +; NOOPT-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v6, v18 +; NOOPT-NEXT: v_mov_b32_e32 v7, v17 +; NOOPT-NEXT: v_mov_b32_e32 v8, v16 +; NOOPT-NEXT: v_mov_b32_e32 v9, v15 +; NOOPT-NEXT: v_mov_b32_e32 v10, v14 +; NOOPT-NEXT: v_mov_b32_e32 v11, v13 +; NOOPT-NEXT: v_mov_b32_e32 v12, v0 +; NOOPT-NEXT: v_mov_b32_e32 v24, v12 +; NOOPT-NEXT: v_mov_b32_e32 v25, v11 +; NOOPT-NEXT: v_mov_b32_e32 v26, v10 +; NOOPT-NEXT: v_mov_b32_e32 v27, v9 +; NOOPT-NEXT: v_mov_b32_e32 v28, v8 +; NOOPT-NEXT: v_mov_b32_e32 v29, v7 +; NOOPT-NEXT: v_mov_b32_e32 v30, v6 +; NOOPT-NEXT: v_mov_b32_e32 v0, v5 +; NOOPT-NEXT: v_mov_b32_e32 v13, v4 +; NOOPT-NEXT: v_mov_b32_e32 v14, v3 +; NOOPT-NEXT: v_mov_b32_e32 v15, v2 +; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; NOOPT-NEXT: s_mov_b32 s6, 0 +; NOOPT-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; NOOPT-NEXT: s_mov_b32 s7, s5 +; NOOPT-NEXT: ; kill: def $sgpr8_sgpr9 killed $sgpr8_sgpr9 def $sgpr8_sgpr9_sgpr10_sgpr11 +; NOOPT-NEXT: s_mov_b64 s[10:11], s[6:7] +; NOOPT-NEXT: v_mov_b32_e32 v2, 32 +; NOOPT-NEXT: v_mov_b32_e32 v3, 0 +; NOOPT-NEXT: buffer_load_dwordx4 v[2:5], v[2:3], s[8:11], 0 addr64 offset:16 glc ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_mov_b32_e32 v16, v15 -; NOOPT-NEXT: v_mov_b32_e32 v17, v14 -; NOOPT-NEXT: v_mov_b32_e32 v18, v13 -; NOOPT-NEXT: v_mov_b32_e32 v19, v12 -; NOOPT-NEXT: v_mov_b32_e32 v20, v11 -; NOOPT-NEXT: v_mov_b32_e32 v21, v10 -; NOOPT-NEXT: v_mov_b32_e32 v22, v9 -; NOOPT-NEXT: v_mov_b32_e32 v23, v8 -; NOOPT-NEXT: v_mov_b32_e32 v24, v7 -; NOOPT-NEXT: v_mov_b32_e32 v25, v6 -; NOOPT-NEXT: v_mov_b32_e32 v26, v5 -; NOOPT-NEXT: v_mov_b32_e32 v27, v4 -; NOOPT-NEXT: v_mov_b32_e32 v28, v3 -; NOOPT-NEXT: v_mov_b32_e32 v29, v2 -; NOOPT-NEXT: v_mov_b32_e32 v30, v1 -; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v9, v5 +; NOOPT-NEXT: v_mov_b32_e32 v10, v4 +; NOOPT-NEXT: v_mov_b32_e32 v11, v3 +; NOOPT-NEXT: v_mov_b32_e32 v12, v2 +; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v2, v15 +; NOOPT-NEXT: v_mov_b32_e32 v3, v14 +; NOOPT-NEXT: v_mov_b32_e32 v4, v13 +; NOOPT-NEXT: v_mov_b32_e32 v5, v12 +; NOOPT-NEXT: v_mov_b32_e32 v6, v11 +; NOOPT-NEXT: v_mov_b32_e32 v7, v10 +; NOOPT-NEXT: v_mov_b32_e32 v8, v9 +; NOOPT-NEXT: v_mov_b32_e32 v16, v8 +; NOOPT-NEXT: v_mov_b32_e32 v17, v7 +; NOOPT-NEXT: v_mov_b32_e32 v18, v6 +; NOOPT-NEXT: v_mov_b32_e32 v19, v5 +; NOOPT-NEXT: v_mov_b32_e32 v20, v4 +; NOOPT-NEXT: v_mov_b32_e32 v21, v3 +; NOOPT-NEXT: v_mov_b32_e32 v22, v2 +; NOOPT-NEXT: v_mov_b32_e32 v23, v1 ; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec ; NOOPT-NEXT: v_mov_b32_e32 v1, v30 ; NOOPT-NEXT: v_mov_b32_e32 v2, v29 @@ -8426,6 +8734,12 @@ define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out, ; NOOPT-LABEL: extractelement_v16i32_or_index: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dword s0, s[4:5], 0xd +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; NOOPT-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xb ; NOOPT-NEXT: s_load_dword s4, s[4:5], 0xd ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -8438,35 +8752,74 @@ define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out, ; NOOPT-NEXT: s_mov_b32 s2, s6 ; NOOPT-NEXT: s_mov_b32 s3, s5 ; NOOPT-NEXT: s_mov_b32 s7, s9 -; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 -; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 -; NOOPT-NEXT: s_mov_b32 s9, s7 -; NOOPT-NEXT: s_mov_b32 s10, s6 -; NOOPT-NEXT: s_mov_b32 s11, s5 -; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc +; NOOPT-NEXT: s_mov_b32 s12, s8 +; NOOPT-NEXT: ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13_sgpr14_sgpr15 +; NOOPT-NEXT: s_mov_b32 s13, s7 +; NOOPT-NEXT: s_mov_b32 s14, s6 +; NOOPT-NEXT: s_mov_b32 s15, s5 +; NOOPT-NEXT: buffer_load_dwordx4 v[5:8], off, s[12:15], 0 glc ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 glc +; NOOPT-NEXT: buffer_load_dwordx4 v[9:12], off, s[12:15], 0 offset:16 glc ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 glc +; NOOPT-NEXT: buffer_load_dwordx4 v[1:4], off, s[12:15], 0 offset:32 glc ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc +; NOOPT-NEXT: v_mov_b32_e32 v0, v12 +; NOOPT-NEXT: v_mov_b32_e32 v13, v11 +; NOOPT-NEXT: v_mov_b32_e32 v14, v10 +; NOOPT-NEXT: v_mov_b32_e32 v15, v9 +; NOOPT-NEXT: v_mov_b32_e32 v16, v8 +; NOOPT-NEXT: v_mov_b32_e32 v17, v7 +; NOOPT-NEXT: v_mov_b32_e32 v18, v6 +; NOOPT-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec +; NOOPT-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v6, v18 +; NOOPT-NEXT: v_mov_b32_e32 v7, v17 +; NOOPT-NEXT: v_mov_b32_e32 v8, v16 +; NOOPT-NEXT: v_mov_b32_e32 v9, v15 +; NOOPT-NEXT: v_mov_b32_e32 v10, v14 +; NOOPT-NEXT: v_mov_b32_e32 v11, v13 +; NOOPT-NEXT: v_mov_b32_e32 v12, v0 +; NOOPT-NEXT: v_mov_b32_e32 v24, v12 +; NOOPT-NEXT: v_mov_b32_e32 v25, v11 +; NOOPT-NEXT: v_mov_b32_e32 v26, v10 +; NOOPT-NEXT: v_mov_b32_e32 v27, v9 +; NOOPT-NEXT: v_mov_b32_e32 v28, v8 +; NOOPT-NEXT: v_mov_b32_e32 v29, v7 +; NOOPT-NEXT: v_mov_b32_e32 v30, v6 +; NOOPT-NEXT: v_mov_b32_e32 v0, v5 +; NOOPT-NEXT: v_mov_b32_e32 v13, v4 +; NOOPT-NEXT: v_mov_b32_e32 v14, v3 +; NOOPT-NEXT: v_mov_b32_e32 v15, v2 +; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; NOOPT-NEXT: s_mov_b32 s6, 0 +; NOOPT-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; NOOPT-NEXT: s_mov_b32 s7, s5 +; NOOPT-NEXT: ; kill: def $sgpr8_sgpr9 killed $sgpr8_sgpr9 def $sgpr8_sgpr9_sgpr10_sgpr11 +; NOOPT-NEXT: s_mov_b64 s[10:11], s[6:7] +; NOOPT-NEXT: v_mov_b32_e32 v2, 32 +; NOOPT-NEXT: v_mov_b32_e32 v3, 0 +; NOOPT-NEXT: buffer_load_dwordx4 v[2:5], v[2:3], s[8:11], 0 addr64 offset:16 glc ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_mov_b32_e32 v16, v15 -; NOOPT-NEXT: v_mov_b32_e32 v17, v14 -; NOOPT-NEXT: v_mov_b32_e32 v18, v13 -; NOOPT-NEXT: v_mov_b32_e32 v19, v12 -; NOOPT-NEXT: v_mov_b32_e32 v20, v11 -; NOOPT-NEXT: v_mov_b32_e32 v21, v10 -; NOOPT-NEXT: v_mov_b32_e32 v22, v9 -; NOOPT-NEXT: v_mov_b32_e32 v23, v8 -; NOOPT-NEXT: v_mov_b32_e32 v24, v7 -; NOOPT-NEXT: v_mov_b32_e32 v25, v6 -; NOOPT-NEXT: v_mov_b32_e32 v26, v5 -; NOOPT-NEXT: v_mov_b32_e32 v27, v4 -; NOOPT-NEXT: v_mov_b32_e32 v28, v3 -; NOOPT-NEXT: v_mov_b32_e32 v29, v2 -; NOOPT-NEXT: v_mov_b32_e32 v30, v1 -; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v9, v5 +; NOOPT-NEXT: v_mov_b32_e32 v10, v4 +; NOOPT-NEXT: v_mov_b32_e32 v11, v3 +; NOOPT-NEXT: v_mov_b32_e32 v12, v2 +; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v2, v15 +; NOOPT-NEXT: v_mov_b32_e32 v3, v14 +; NOOPT-NEXT: v_mov_b32_e32 v4, v13 +; NOOPT-NEXT: v_mov_b32_e32 v5, v12 +; NOOPT-NEXT: v_mov_b32_e32 v6, v11 +; NOOPT-NEXT: v_mov_b32_e32 v7, v10 +; NOOPT-NEXT: v_mov_b32_e32 v8, v9 +; NOOPT-NEXT: v_mov_b32_e32 v16, v8 +; NOOPT-NEXT: v_mov_b32_e32 v17, v7 +; NOOPT-NEXT: v_mov_b32_e32 v18, v6 +; NOOPT-NEXT: v_mov_b32_e32 v19, v5 +; NOOPT-NEXT: v_mov_b32_e32 v20, v4 +; NOOPT-NEXT: v_mov_b32_e32 v21, v3 +; NOOPT-NEXT: v_mov_b32_e32 v22, v2 +; NOOPT-NEXT: v_mov_b32_e32 v23, v1 ; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec ; NOOPT-NEXT: v_mov_b32_e32 v1, v30 ; NOOPT-NEXT: v_mov_b32_e32 v2, v29 @@ -8701,20 +9054,25 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, ; ; NOOPT-LABEL: insertelement_v16f32_or_index: ; NOOPT: ; %bb.0: -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; NOOPT-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 -; NOOPT-NEXT: s_load_dword s4, s[4:5], 0x29 +; NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5] +; NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; NOOPT-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) -; NOOPT-NEXT: s_mov_b32 s7, s1 -; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 -; NOOPT-NEXT: s_mov_b32 s5, 0xf000 -; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: s_load_dword s2, s[0:1], 0x29 +; NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; NOOPT-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x19 +; NOOPT-NEXT: s_load_dword s7, s[0:1], 0x29 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s25, s5 +; NOOPT-NEXT: s_mov_b32 s0, s4 +; NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; NOOPT-NEXT: s_mov_b32 s24, -1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 -; NOOPT-NEXT: s_mov_b32 s1, s7 -; NOOPT-NEXT: s_mov_b32 s2, s6 -; NOOPT-NEXT: s_mov_b32 s3, s5 -; NOOPT-NEXT: s_mov_b32 s5, 2 -; NOOPT-NEXT: s_lshl_b32 s4, s4, s5 +; NOOPT-NEXT: s_mov_b32 s1, s25 +; NOOPT-NEXT: s_mov_b32 s2, s24 +; NOOPT-NEXT: s_mov_b32 s3, s6 +; NOOPT-NEXT: s_mov_b32 s24, 2 +; NOOPT-NEXT: s_lshl_b32 s7, s7, s24 ; NOOPT-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; NOOPT-NEXT: v_mov_b32_e32 v7, s8 ; NOOPT-NEXT: v_mov_b32_e32 v8, s9 @@ -8732,7 +9090,7 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, ; NOOPT-NEXT: v_mov_b32_e32 v20, s21 ; NOOPT-NEXT: v_mov_b32_e32 v21, s22 ; NOOPT-NEXT: v_mov_b32_e32 v22, s23 -; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: s_mov_b32 m0, s7 ; NOOPT-NEXT: v_movreld_b32_e32 v8, v0 ; NOOPT-NEXT: v_mov_b32_e32 v4, v22 ; NOOPT-NEXT: v_mov_b32_e32 v5, v21 @@ -8742,7 +9100,14 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, ; NOOPT-NEXT: v_mov_b32_e32 v1, v6 ; NOOPT-NEXT: v_mov_b32_e32 v2, v5 ; NOOPT-NEXT: v_mov_b32_e32 v3, v4 -; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: s_mov_b32 s8, 0 +; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; NOOPT-NEXT: s_mov_b32 s9, s6 +; NOOPT-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: s_mov_b64 s[6:7], s[8:9] +; NOOPT-NEXT: v_mov_b32_e32 v4, 32 +; NOOPT-NEXT: v_mov_b32_e32 v5, 0 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:16 ; NOOPT-NEXT: v_mov_b32_e32 v4, v18 ; NOOPT-NEXT: v_mov_b32_e32 v5, v17 ; NOOPT-NEXT: v_mov_b32_e32 v6, v16 @@ -8983,12 +9348,16 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; NOOPT-NEXT: s_mov_b32 s27, 0xe8f000 ; NOOPT-NEXT: s_add_u32 s24, s24, s11 ; NOOPT-NEXT: s_addc_u32 s25, s25, 0 +; NOOPT-NEXT: s_load_dword s0, s[4:5], 0x9 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_load_dword s0, s[4:5], 0xa ; NOOPT-NEXT: s_load_dword s1, s[4:5], 0x9 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_load_dword s0, s[4:5], 0xa ; NOOPT-NEXT: ; implicit-def: $vgpr18 : SGPR spill to VGPR lane -; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: v_writelane_b32 v18, s1, 0 ; NOOPT-NEXT: s_mov_b32 s1, 8 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: v_writelane_b32 v18, s0, 1 ; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 ; NOOPT-NEXT: buffer_store_dword v18, off, s[24:27], 0 ; 4-byte Folded Spill @@ -9003,15 +9372,16 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; NOOPT-NEXT: buffer_load_dword v18, off, s[24:27], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[20:21] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s2, v18, 0 +; NOOPT-NEXT: v_readlane_b32 s3, v18, 0 ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 s[0:1], -1 -; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr2 ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_cmp_ge_i32_e64 s[2:3], v0, s2 -; NOOPT-NEXT: v_mov_b32_e32 v0, s4 -; NOOPT-NEXT: s_and_b64 vcc, exec, s[2:3] +; NOOPT-NEXT: v_cmp_lt_i32_e64 s[4:5], v0, s3 +; NOOPT-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] +; NOOPT-NEXT: s_and_b64 vcc, exec, s[4:5] +; NOOPT-NEXT: v_mov_b32_e32 v0, s2 ; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Spill ; NOOPT-NEXT: v_writelane_b32 v18, s0, 2 ; NOOPT-NEXT: v_writelane_b32 v18, s1, 3 @@ -9210,9 +9580,8 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; NOOPT-NEXT: v_readlane_b32 s0, v18, 2 ; NOOPT-NEXT: v_readlane_b32 s1, v18, 3 ; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Reload -; NOOPT-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; NOOPT-NEXT: s_mov_b32 s0, 1 -; NOOPT-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, s0 +; NOOPT-NEXT: s_mov_b64 s[2:3], -1 +; NOOPT-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; NOOPT-NEXT: s_and_b64 vcc, exec, s[0:1] ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill @@ -9475,23 +9844,25 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: s_mov_b32 s5, s0 ; NOOPT-NEXT: s_mov_b32 s6, s2 ; NOOPT-NEXT: s_mov_b32 s7, s3 -; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v1, v2 -; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:136 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:140 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b32 s8, 0xf000 ; NOOPT-NEXT: s_mov_b32 s0, 0 ; NOOPT-NEXT: v_writelane_b32 v33, s0, 2 ; NOOPT-NEXT: s_mov_b32 s2, s0 ; NOOPT-NEXT: s_mov_b32 s3, s8 +; NOOPT-NEXT: v_writelane_b32 v33, s2, 3 +; NOOPT-NEXT: v_writelane_b32 v33, s3, 4 ; NOOPT-NEXT: s_mov_b32 s8, s0 ; NOOPT-NEXT: s_mov_b32 s9, s0 ; NOOPT-NEXT: ; kill: def $sgpr8_sgpr9 killed $sgpr8_sgpr9 def $sgpr8_sgpr9_sgpr10_sgpr11 ; NOOPT-NEXT: s_mov_b64 s[10:11], s[2:3] -; NOOPT-NEXT: v_writelane_b32 v33, s8, 3 -; NOOPT-NEXT: v_writelane_b32 v33, s9, 4 -; NOOPT-NEXT: v_writelane_b32 v33, s10, 5 -; NOOPT-NEXT: v_writelane_b32 v33, s11, 6 +; NOOPT-NEXT: v_writelane_b32 v33, s8, 5 +; NOOPT-NEXT: v_writelane_b32 v33, s9, 6 +; NOOPT-NEXT: v_writelane_b32 v33, s10, 7 +; NOOPT-NEXT: v_writelane_b32 v33, s11, 8 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v2 +; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:136 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:140 ; 4-byte Folded Spill ; NOOPT-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10_sgpr11 killed $sgpr4_sgpr5_sgpr6_sgpr7 ; NOOPT-NEXT: s_waitcnt expcnt(1) ; NOOPT-NEXT: v_mov_b32_e32 v0, s1 @@ -9548,8 +9919,8 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: buffer_store_dword v14, off, s[16:19], 0 offset:124 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:128 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[0:1], exec -; NOOPT-NEXT: v_writelane_b32 v33, s0, 7 -; NOOPT-NEXT: v_writelane_b32 v33, s1, 8 +; NOOPT-NEXT: v_writelane_b32 v33, s0, 9 +; NOOPT-NEXT: v_writelane_b32 v33, s1, 10 ; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 ; NOOPT-NEXT: buffer_store_dword v33, off, s[16:19], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[12:13] @@ -9576,8 +9947,8 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: buffer_load_dword v33, off, s[16:19], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[12:13] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v33, 9 -; NOOPT-NEXT: v_readlane_b32 s1, v33, 10 +; NOOPT-NEXT: v_readlane_b32 s0, v33, 11 +; NOOPT-NEXT: v_readlane_b32 s1, v33, 12 ; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload @@ -9642,8 +10013,8 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: buffer_store_dword v14, off, s[16:19], 0 offset:60 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:64 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] -; NOOPT-NEXT: v_writelane_b32 v33, s2, 9 -; NOOPT-NEXT: v_writelane_b32 v33, s3, 10 +; NOOPT-NEXT: v_writelane_b32 v33, s2, 11 +; NOOPT-NEXT: v_writelane_b32 v33, s3, 12 ; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 ; NOOPT-NEXT: buffer_store_dword v33, off, s[16:19], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[12:13] @@ -9655,18 +10026,20 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: buffer_load_dword v33, off, s[16:19], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[12:13] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v33, 7 -; NOOPT-NEXT: v_readlane_b32 s1, v33, 8 +; NOOPT-NEXT: v_readlane_b32 s0, v33, 9 +; NOOPT-NEXT: v_readlane_b32 s1, v33, 10 ; NOOPT-NEXT: s_mov_b64 exec, s[0:1] ; NOOPT-NEXT: ; %bb.3: ; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 ; NOOPT-NEXT: buffer_load_dword v33, off, s[16:19], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[12:13] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v33, 3 -; NOOPT-NEXT: v_readlane_b32 s1, v33, 4 -; NOOPT-NEXT: v_readlane_b32 s2, v33, 5 -; NOOPT-NEXT: v_readlane_b32 s3, v33, 6 +; NOOPT-NEXT: v_readlane_b32 s0, v33, 5 +; NOOPT-NEXT: v_readlane_b32 s1, v33, 6 +; NOOPT-NEXT: v_readlane_b32 s2, v33, 7 +; NOOPT-NEXT: v_readlane_b32 s3, v33, 8 +; NOOPT-NEXT: v_readlane_b32 s8, v33, 3 +; NOOPT-NEXT: v_readlane_b32 s9, v33, 4 ; NOOPT-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:136 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:140 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[16:19], 0 offset:148 ; 4-byte Folded Reload @@ -9709,9 +10082,12 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: v_mov_b32_e32 v18, v13 ; NOOPT-NEXT: v_mov_b32_e32 v19, v12 ; NOOPT-NEXT: v_mov_b32_e32 v20, v11 +; NOOPT-NEXT: s_mov_b64 s[4:5], 32 +; NOOPT-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: s_mov_b64 s[6:7], s[8:9] ; NOOPT-NEXT: v_mov_b32_e32 v12, v5 ; NOOPT-NEXT: v_mov_b32_e32 v11, v4 -; NOOPT-NEXT: buffer_store_dwordx4 v[17:20], v[11:12], s[0:3], 0 addr64 offset:48 +; NOOPT-NEXT: buffer_store_dwordx4 v[17:20], v[11:12], s[4:7], 0 addr64 offset:16 ; NOOPT-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11_vgpr12_vgpr13 killed $exec ; NOOPT-NEXT: v_mov_b32_e32 v11, v16 ; NOOPT-NEXT: v_mov_b32_e32 v12, v15 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll index 98658ded0390..6e6875c84c58 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll @@ -14,6 +14,7 @@ define amdgpu_kernel void @extract_w_offset_vgpr(ptr addrspace(1) %out) { ; GCN-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0 + ; GCN-NEXT: dead early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from constant-pool + 36, align 4, addrspace 4) ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4) ; GCN-NEXT: renamable $sgpr6 = COPY renamable $sgpr1 ; GCN-NEXT: renamable $sgpr0 = COPY renamable $sgpr0, implicit killed $sgpr0_sgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index c3f391786f87..d895a75de6e8 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -8,10 +8,13 @@ define <2 x i64> @f1() #0 { ; GFX11-LABEL: f1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_mov_b64 s[0:1], 0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: v_mov_b32_e32 v3, s3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ret <2 x i64> zeroinitializer } @@ -20,7 +23,7 @@ define void @f0() { ; GFX11-LABEL: f0: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s16, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v4, s33 ; 4-byte Folded Spill @@ -41,7 +44,7 @@ define void @f0() { ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v4, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_mov_b32 s33, s2 +; GFX11-NEXT: s_mov_b32 s33, s16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: @@ -52,9 +55,9 @@ bb: define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) { ; GFX11-LABEL: f2: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_mov_b64 s[16:17], s[4:5] +; GFX11-NEXT: s_mov_b64 s[18:19], s[4:5] ; GFX11-NEXT: v_mov_b32_e32 v31, v0 -; GFX11-NEXT: s_load_b32 s19, s[16:17], 0x24 +; GFX11-NEXT: s_load_b32 s24, s[18:19], 0x24 ; GFX11-NEXT: s_mov_b32 s12, s13 ; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3] @@ -62,34 +65,34 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX11-NEXT: s_mov_b32 s20, 0 ; GFX11-NEXT: s_mov_b32 s0, -1 -; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: s_mov_b32 s17, exec_lo ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_lo_u32 v0, s19, v0 +; GFX11-NEXT: v_mul_lo_u32 v0, s24, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB2_13 ; GFX11-NEXT: ; %bb.1: ; %bb14 -; GFX11-NEXT: s_load_b128 s[20:23], s[16:17], 0x2c -; GFX11-NEXT: s_mov_b32 s18, 0 +; GFX11-NEXT: s_load_b128 s[20:23], s[18:19], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_bitcmp1_b32 s21, 0 -; GFX11-NEXT: s_cselect_b32 s24, -1, 0 +; GFX11-NEXT: s_cselect_b32 s25, -1, 0 ; GFX11-NEXT: s_bitcmp0_b32 s21, 0 +; GFX11-NEXT: s_mov_b32 s21, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX11-NEXT: ; %bb.2: ; %bb15 -; GFX11-NEXT: s_add_u32 s8, s16, 0x58 -; GFX11-NEXT: s_addc_u32 s9, s17, 0 +; GFX11-NEXT: s_add_u32 s8, s18, 0x58 +; GFX11-NEXT: s_addc_u32 s9, s19, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12 ; GFX11-NEXT: s_mov_b32 s13, s14 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s21, s14 +; GFX11-NEXT: s_mov_b32 s26, s14 ; GFX11-NEXT: s_mov_b32 s14, s15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_mov_b32 s14, s21 +; GFX11-NEXT: s_mov_b32 s14, s26 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_cbranch_execz .LBB2_4 ; GFX11-NEXT: s_branch .LBB2_12 @@ -98,18 +101,18 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_vccnz .LBB2_12 ; GFX11-NEXT: .LBB2_4: ; %bb16 -; GFX11-NEXT: s_load_b32 s0, s[16:17], 0x54 +; GFX11-NEXT: s_load_b32 s0, s[18:19], 0x54 ; GFX11-NEXT: s_bitcmp1_b32 s23, 0 -; GFX11-NEXT: s_cselect_b32 s9, -1, 0 +; GFX11-NEXT: s_cselect_b32 s8, -1, 0 ; GFX11-NEXT: s_and_b32 s1, s23, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_bitcmp1_b32 s0, 0 ; GFX11-NEXT: s_mov_b32 s0, -1 -; GFX11-NEXT: s_cselect_b32 s8, -1, 0 +; GFX11-NEXT: s_cselect_b32 s3, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_8 ; GFX11-NEXT: ; %bb.5: ; %bb18.preheader -; GFX11-NEXT: s_load_b128 s[28:31], s[16:17], 0x44 +; GFX11-NEXT: s_load_b128 s[28:31], s[18:19], 0x44 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_hi_u32 s1, s29, s28 @@ -123,11 +126,11 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_mul_i32 s0, s0, s22 ; GFX11-NEXT: s_mul_i32 s0, s0, s20 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s0, s19, s0 -; GFX11-NEXT: s_lshl_b64 s[20:21], s[0:1], 1 +; GFX11-NEXT: s_or_b32 s0, s24, s0 +; GFX11-NEXT: s_lshl_b64 s[22:23], s[0:1], 1 ; GFX11-NEXT: s_mov_b32 s0, s1 -; GFX11-NEXT: global_load_u16 v1, v0, s[20:21] -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s24 +; GFX11-NEXT: global_load_u16 v1, v0, s[22:23] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s25 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo @@ -136,28 +139,28 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: .LBB2_6: ; %bb18 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_cselect_b32 s13, -1, 0 +; GFX11-NEXT: s_cselect_b32 s9, -1, 0 ; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 -; GFX11-NEXT: s_and_b32 s13, s8, s13 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s9 +; GFX11-NEXT: s_and_b32 s9, s3, s9 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: s_and_b32 s13, s13, exec_lo -; GFX11-NEXT: v_readfirstlane_b32 s19, v2 -; GFX11-NEXT: s_cselect_b32 s1, s19, s1 +; GFX11-NEXT: s_and_b32 s9, s9, exec_lo +; GFX11-NEXT: v_readfirstlane_b32 s13, v2 +; GFX11-NEXT: s_cselect_b32 s1, s13, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s1, s1, 1 -; GFX11-NEXT: s_and_b32 s13, 0xffff, s0 -; GFX11-NEXT: s_cselect_b32 s13, -1, 0 -; GFX11-NEXT: s_and_b32 s20, s9, exec_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 -; GFX11-NEXT: v_readfirstlane_b32 s13, v1 +; GFX11-NEXT: s_and_b32 s9, 0xffff, s0 +; GFX11-NEXT: s_cselect_b32 s9, -1, 0 +; GFX11-NEXT: s_and_b32 s16, s8, exec_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s9 +; GFX11-NEXT: v_readfirstlane_b32 s9, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s19, v2 -; GFX11-NEXT: s_cselect_b32 s13, s19, s13 -; GFX11-NEXT: s_bitcmp1_b32 s13, 0 -; GFX11-NEXT: s_cselect_b32 s13, 0x100, 0 +; GFX11-NEXT: v_readfirstlane_b32 s13, v2 +; GFX11-NEXT: s_cselect_b32 s9, s13, s9 +; GFX11-NEXT: s_bitcmp1_b32 s9, 0 +; GFX11-NEXT: s_cselect_b32 s9, 0x100, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s0, s13, s0 +; GFX11-NEXT: s_or_b32 s0, s9, s0 ; GFX11-NEXT: s_cbranch_vccz .LBB2_6 ; GFX11-NEXT: ; %bb.7: ; %Flow ; GFX11-NEXT: s_mov_b32 s0, 0 @@ -166,24 +169,24 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_vccz .LBB2_12 ; GFX11-NEXT: ; %bb.9: -; GFX11-NEXT: s_xor_b32 s0, s8, -1 +; GFX11-NEXT: s_xor_b32 s0, s3, -1 ; GFX11-NEXT: .LBB2_10: ; %bb17 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_vccz .LBB2_10 ; GFX11-NEXT: ; %bb.11: ; %Flow6 -; GFX11-NEXT: s_mov_b32 s18, -1 +; GFX11-NEXT: s_mov_b32 s21, -1 ; GFX11-NEXT: .LBB2_12: ; %Flow11 ; GFX11-NEXT: s_and_b32 s20, s2, exec_lo -; GFX11-NEXT: s_or_not1_b32 s0, s18, exec_lo +; GFX11-NEXT: s_or_not1_b32 s0, s21, exec_lo ; GFX11-NEXT: .LBB2_13: ; %Flow9 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11-NEXT: s_and_saveexec_b32 s3, s0 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s17 +; GFX11-NEXT: s_and_saveexec_b32 s17, s0 ; GFX11-NEXT: s_cbranch_execz .LBB2_15 ; GFX11-NEXT: ; %bb.14: ; %bb43 -; GFX11-NEXT: s_add_u32 s8, s16, 0x58 -; GFX11-NEXT: s_addc_u32 s9, s17, 0 +; GFX11-NEXT: s_add_u32 s8, s18, 0x58 +; GFX11-NEXT: s_addc_u32 s9, s19, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12 @@ -194,7 +197,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_or_b32 s20, s20, exec_lo ; GFX11-NEXT: .LBB2_15: ; %Flow14 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s17 ; GFX11-NEXT: s_and_saveexec_b32 s0, s20 ; GFX11-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock ; GFX11-NEXT: ; divergent unreachable diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index beeeaa32cacf..03511ec11acd 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -35,6 +35,11 @@ define amdgpu_kernel void @float4_inselt(ptr addrspace(1) %out, <4 x float> %vec ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-O0-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[2:3], 0x44 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-O0-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x44 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1.0 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) @@ -71,6 +76,10 @@ define amdgpu_kernel void @float4_inselt_undef(ptr addrspace(1) %out, i32 %sel) ; GCN-O0-LABEL: float4_inselt_undef: ; GCN-O0: ; %bb.0: ; %entry ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1.0 ; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 @@ -120,6 +129,11 @@ define amdgpu_kernel void @int4_inselt(ptr addrspace(1) %out, <4 x i32> %vec, i3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-O0-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[2:3], 0x44 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-O0-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x44 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) @@ -162,6 +176,12 @@ define amdgpu_kernel void @float2_inselt(ptr addrspace(1) %out, <2 x float> %vec ; GCN-O0: ; %bb.0: ; %entry ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c ; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1.0 @@ -211,6 +231,11 @@ define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec ; GCN-O0-LABEL: float8_inselt: ; GCN-O0: ; %bb.0: ; %entry ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: s_load_dword s0, s[2:3], 0x64 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-O0-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-O0-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 ; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x64 @@ -315,6 +340,11 @@ define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %v ; GCN-O0-LABEL: float16_inselt: ; GCN-O0: ; %bb.0: ; %entry ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: s_load_dword s0, s[2:3], 0xa4 +; GCN-O0-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-O0-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 ; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0xa4 @@ -346,26 +376,6 @@ define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %v ; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 ; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 ; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 48 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v18 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v17 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v16 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v15 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 ; GCN-O0-NEXT: s_mov_b64 s[6:7], 32 ; GCN-O0-NEXT: s_mov_b32 s2, s0 ; GCN-O0-NEXT: s_mov_b32 s3, s1 @@ -375,6 +385,26 @@ define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %v ; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 ; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 +; GCN-O0-NEXT: s_mov_b32 s4, s2 +; GCN-O0-NEXT: s_mov_b32 s5, s3 +; GCN-O0-NEXT: s_mov_b32 s9, s6 +; GCN-O0-NEXT: s_mov_b32 s8, s7 +; GCN-O0-NEXT: s_add_u32 s4, s4, s9 +; GCN-O0-NEXT: s_addc_u32 s8, s5, s8 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v18 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v17 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v16 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v15 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 ; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -386,7 +416,6 @@ define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %v ; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 ; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 ; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 ; GCN-O0-NEXT: s_mov_b32 s2, s0 ; GCN-O0-NEXT: s_mov_b32 s3, s1 ; GCN-O0-NEXT: s_mov_b32 s5, s6 @@ -507,8 +536,24 @@ define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %v ; ; GCN-O0-LABEL: float32_inselt: ; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xa4 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0xa4 +; GCN-O0-NEXT: s_mov_b32 s2, s4 +; GCN-O0-NEXT: s_mov_b32 s0, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s1, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s3 +; GCN-O0-NEXT: s_addc_u32 s0, s0, s1 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s0 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x40 +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x124 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xe4 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x40 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_mov_b32 s2, s51 ; GCN-O0-NEXT: s_mov_b32 s3, s50 @@ -621,66 +666,6 @@ define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %v ; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 ; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 ; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x70 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v34 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v33 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v32 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v31 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x60 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v30 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v29 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v28 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v27 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x50 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v26 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v25 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v24 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v23 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 ; GCN-O0-NEXT: s_mov_b64 s[6:7], 64 ; GCN-O0-NEXT: s_mov_b32 s2, s0 ; GCN-O0-NEXT: s_mov_b32 s3, s1 @@ -690,6 +675,65 @@ define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %v ; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 ; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: s_mov_b64 s[8:9], 32 +; GCN-O0-NEXT: s_mov_b32 s4, s2 +; GCN-O0-NEXT: s_mov_b32 s5, s3 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s6, s9 +; GCN-O0-NEXT: s_add_u32 s4, s4, s7 +; GCN-O0-NEXT: s_addc_u32 s6, s5, s6 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 +; GCN-O0-NEXT: s_mov_b32 s10, s4 +; GCN-O0-NEXT: s_mov_b32 s11, s5 +; GCN-O0-NEXT: s_mov_b32 s13, s6 +; GCN-O0-NEXT: s_mov_b32 s12, s7 +; GCN-O0-NEXT: s_add_u32 s10, s10, s13 +; GCN-O0-NEXT: s_addc_u32 s12, s11, s12 +; GCN-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11 +; GCN-O0-NEXT: s_mov_b32 s11, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s11 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v34 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v33 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v32 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v31 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v30 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v29 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v28 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v27 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b32 s4, s2 +; GCN-O0-NEXT: s_mov_b32 s5, s3 +; GCN-O0-NEXT: s_mov_b32 s11, s6 +; GCN-O0-NEXT: s_mov_b32 s10, s7 +; GCN-O0-NEXT: s_add_u32 s4, s4, s11 +; GCN-O0-NEXT: s_addc_u32 s10, s5, s10 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v26 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v25 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v24 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v23 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 ; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -701,17 +745,24 @@ define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %v ; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 ; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 ; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 48 ; GCN-O0-NEXT: s_mov_b32 s2, s0 ; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_mov_b32 s5, s8 +; GCN-O0-NEXT: s_mov_b32 s4, s9 ; GCN-O0-NEXT: s_add_u32 s2, s2, s5 ; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 ; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: s_mov_b32 s4, s2 +; GCN-O0-NEXT: s_mov_b32 s5, s3 +; GCN-O0-NEXT: s_mov_b32 s9, s6 +; GCN-O0-NEXT: s_mov_b32 s8, s7 +; GCN-O0-NEXT: s_add_u32 s4, s4, s9 +; GCN-O0-NEXT: s_addc_u32 s8, s5, s8 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 ; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GCN-O0-NEXT: v_mov_b32_e32 v0, v18 ; GCN-O0-NEXT: v_mov_b32_e32 v1, v17 @@ -721,15 +772,6 @@ define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %v ; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 ; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 ; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 32 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 ; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -741,7 +783,6 @@ define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %v ; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 ; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 ; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 ; GCN-O0-NEXT: s_mov_b32 s2, s0 ; GCN-O0-NEXT: s_mov_b32 s3, s1 ; GCN-O0-NEXT: s_mov_b32 s5, s6 @@ -795,19 +836,25 @@ define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, ; GCN-O0: ; %bb.0: ; %entry ; GCN-O0-NEXT: s_mov_b64 s[0:1], s[4:5] ; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GCN-O0-NEXT: s_load_dword s6, s[0:1], 0x34 -; GCN-O0-NEXT: s_mov_b32 s7, 0x3c003c00 -; GCN-O0-NEXT: s_mov_b32 s0, s7 -; GCN-O0-NEXT: s_mov_b32 s1, s7 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] -; GCN-O0-NEXT: s_mov_b32 s7, 4 -; GCN-O0-NEXT: s_lshl_b32 s8, s6, s7 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0xffff -; GCN-O0-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 +; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-O0-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN-O0-NEXT: s_mov_b32 s1, 4 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_lshl_b32 s6, s0, s1 +; GCN-O0-NEXT: s_mov_b64 s[0:1], 0xffff +; GCN-O0-NEXT: s_lshl_b64 s[0:1], s[0:1], s6 +; GCN-O0-NEXT: s_andn2_b64 s[4:5], s[4:5], s[0:1] +; GCN-O0-NEXT: s_mov_b32 s8, 0x3c003c00 +; GCN-O0-NEXT: s_mov_b32 s6, s8 +; GCN-O0-NEXT: s_mov_b32 s7, s8 ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GCN-O0-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GCN-O0-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 ; GCN-O0-NEXT: v_mov_b32_e32 v3, s1 @@ -838,18 +885,24 @@ define amdgpu_kernel void @half2_inselt(ptr addrspace(1) %out, <2 x half> %vec, ; ; GCN-O0-LABEL: half2_inselt: ; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x30 ; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GCN-O0-NEXT: s_load_dword s1, s[4:5], 0x2c -; GCN-O0-NEXT: s_load_dword s4, s[4:5], 0x30 -; GCN-O0-NEXT: s_mov_b32 s0, 0x3c003c00 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_xor_b32 s0, s1, s0 -; GCN-O0-NEXT: s_mov_b32 s5, 4 -; GCN-O0-NEXT: s_lshl_b32 s5, s4, s5 -; GCN-O0-NEXT: s_mov_b32 s4, 0xffff -; GCN-O0-NEXT: s_lshl_b32 s4, s4, s5 +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x30 +; GCN-O0-NEXT: s_mov_b32 s4, 4 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_lshl_b32 s4, s0, s4 +; GCN-O0-NEXT: s_mov_b32 s0, 0xffff +; GCN-O0-NEXT: s_lshl_b32 s0, s0, s4 +; GCN-O0-NEXT: s_andn2_b32 s1, s1, s0 +; GCN-O0-NEXT: s_mov_b32 s4, 0x3c003c00 ; GCN-O0-NEXT: s_and_b32 s0, s0, s4 -; GCN-O0-NEXT: s_xor_b32 s0, s0, s1 +; GCN-O0-NEXT: s_or_b32 s0, s0, s1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 ; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 @@ -928,39 +981,47 @@ define amdgpu_kernel void @half8_inselt(ptr addrspace(1) %out, <8 x half> %vec, ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-O0-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[2:3], 0x44 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-O0-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x44 ; GCN-O0-NEXT: s_mov_b32 s3, 7 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_and_b32 s2, s2, s3 -; GCN-O0-NEXT: s_mov_b32 s3, 1 -; GCN-O0-NEXT: s_lshl_b32 s3, s2, s3 +; GCN-O0-NEXT: s_mov_b32 s3, 2 +; GCN-O0-NEXT: s_mul_i32 s3, s2, s3 ; GCN-O0-NEXT: s_mov_b32 s2, 0 -; GCN-O0-NEXT: s_or_b32 s2, s2, s3 -; GCN-O0-NEXT: s_mov_b32 s3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 +; GCN-O0-NEXT: s_add_i32 s3, s2, s3 +; GCN-O0-NEXT: s_mov_b32 s8, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:12 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 +; GCN-O0-NEXT: s_mov_b32 s8, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:8 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 +; GCN-O0-NEXT: s_mov_b32 s8, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5_sgpr6_sgpr7 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 ; GCN-O0-NEXT: buffer_store_short v0, v1, s[12:15], 0 offen +; GCN-O0-NEXT: s_mov_b32 s3, 4 +; GCN-O0-NEXT: s_add_i32 s2, s2, s3 +; GCN-O0-NEXT: s_add_i32 s3, s2, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 +; GCN-O0-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen offset:4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s2 +; GCN-O0-NEXT: buffer_load_dword v1, v1, s[12:15], 0 offen offset:4 ; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; GCN-O0-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:4 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 ; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: s_waitcnt vmcnt(2) -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: s_waitcnt vmcnt(1) -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 ; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 @@ -990,18 +1051,24 @@ define amdgpu_kernel void @short2_inselt(ptr addrspace(1) %out, <2 x i16> %vec, ; ; GCN-O0-LABEL: short2_inselt: ; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x30 ; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GCN-O0-NEXT: s_load_dword s1, s[4:5], 0x2c -; GCN-O0-NEXT: s_load_dword s4, s[4:5], 0x30 -; GCN-O0-NEXT: s_mov_b32 s0, 0x10001 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_xor_b32 s0, s1, s0 -; GCN-O0-NEXT: s_mov_b32 s5, 4 -; GCN-O0-NEXT: s_lshl_b32 s5, s4, s5 -; GCN-O0-NEXT: s_mov_b32 s4, 0xffff -; GCN-O0-NEXT: s_lshl_b32 s4, s4, s5 +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x30 +; GCN-O0-NEXT: s_mov_b32 s4, 4 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_lshl_b32 s4, s0, s4 +; GCN-O0-NEXT: s_mov_b32 s0, 0xffff +; GCN-O0-NEXT: s_lshl_b32 s0, s0, s4 +; GCN-O0-NEXT: s_andn2_b32 s1, s1, s0 +; GCN-O0-NEXT: s_mov_b32 s4, 0x10001 ; GCN-O0-NEXT: s_and_b32 s0, s0, s4 -; GCN-O0-NEXT: s_xor_b32 s0, s0, s1 +; GCN-O0-NEXT: s_or_b32 s0, s0, s1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 ; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 @@ -1037,19 +1104,25 @@ define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, ; GCN-O0: ; %bb.0: ; %entry ; GCN-O0-NEXT: s_mov_b64 s[0:1], s[4:5] ; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GCN-O0-NEXT: s_load_dword s6, s[0:1], 0x34 -; GCN-O0-NEXT: s_mov_b32 s7, 0x10001 -; GCN-O0-NEXT: s_mov_b32 s0, s7 -; GCN-O0-NEXT: s_mov_b32 s1, s7 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] -; GCN-O0-NEXT: s_mov_b32 s7, 4 -; GCN-O0-NEXT: s_lshl_b32 s8, s6, s7 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0xffff -; GCN-O0-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 +; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-O0-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN-O0-NEXT: s_mov_b32 s1, 4 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_lshl_b32 s6, s0, s1 +; GCN-O0-NEXT: s_mov_b64 s[0:1], 0xffff +; GCN-O0-NEXT: s_lshl_b64 s[0:1], s[0:1], s6 +; GCN-O0-NEXT: s_andn2_b64 s[4:5], s[4:5], s[0:1] +; GCN-O0-NEXT: s_mov_b32 s8, 0x10001 +; GCN-O0-NEXT: s_mov_b32 s6, s8 +; GCN-O0-NEXT: s_mov_b32 s7, s8 ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GCN-O0-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GCN-O0-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 ; GCN-O0-NEXT: v_mov_b32_e32 v3, s1 @@ -1083,135 +1156,305 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3 ; ; GCN-O0-LABEL: byte8_inselt: ; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_mov_b64 s[0:1], s[4:5] -; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN-O0-NEXT: s_load_dword s6, s[0:1], 0x34 -; GCN-O0-NEXT: s_mov_b32 s7, 0x1010101 -; GCN-O0-NEXT: s_mov_b32 s0, s7 -; GCN-O0-NEXT: s_mov_b32 s1, s7 +; GCN-O0-NEXT: s_mov_b64 s[12:13], s[4:5] +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[12:13], 0x24 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] -; GCN-O0-NEXT: s_mov_b32 s7, 3 -; GCN-O0-NEXT: s_lshl_b32 s8, s6, s7 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0xff -; GCN-O0-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 -; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GCN-O0-NEXT: s_xor_b64 s[10:11], s[0:1], s[2:3] -; GCN-O0-NEXT: s_mov_b32 s3, s10 -; GCN-O0-NEXT: s_mov_b32 s0, 8 -; GCN-O0-NEXT: s_lshr_b32 s0, s3, s0 -; GCN-O0-NEXT: s_mov_b32 s1, s10 -; GCN-O0-NEXT: s_mov_b32 s2, 16 -; GCN-O0-NEXT: s_lshr_b32 s2, s3, s2 -; GCN-O0-NEXT: s_mov_b32 s6, 24 -; GCN-O0-NEXT: s_lshr_b32 s3, s3, s6 -; GCN-O0-NEXT: s_mov_b32 s6, 32 -; GCN-O0-NEXT: s_lshr_b64 s[6:7], s[10:11], s6 -; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 -; GCN-O0-NEXT: s_mov_b32 s7, 40 -; GCN-O0-NEXT: s_lshr_b64 s[8:9], s[10:11], s7 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s8, 48 -; GCN-O0-NEXT: s_lshr_b64 s[8:9], s[10:11], s8 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, 56 -; GCN-O0-NEXT: s_lshr_b64 s[10:11], s[10:11], s9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: s_mov_b64 s[14:15], 7 -; GCN-O0-NEXT: s_mov_b32 s10, s4 -; GCN-O0-NEXT: s_mov_b32 s11, s5 -; GCN-O0-NEXT: s_mov_b32 s13, s14 -; GCN-O0-NEXT: s_mov_b32 s12, s15 -; GCN-O0-NEXT: s_add_u32 s10, s10, s13 -; GCN-O0-NEXT: s_addc_u32 s12, s11, s12 -; GCN-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: s_mov_b64 s[14:15], 6 -; GCN-O0-NEXT: s_mov_b32 s10, s4 -; GCN-O0-NEXT: s_mov_b32 s9, s5 -; GCN-O0-NEXT: s_mov_b32 s12, s14 -; GCN-O0-NEXT: s_mov_b32 s11, s15 -; GCN-O0-NEXT: s_add_u32 s10, s10, s12 -; GCN-O0-NEXT: s_addc_u32 s9, s9, s11 -; GCN-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11 -; GCN-O0-NEXT: s_mov_b32 s11, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s8 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 5 -; GCN-O0-NEXT: s_mov_b32 s8, s4 -; GCN-O0-NEXT: s_mov_b32 s9, s5 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s7 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 4 -; GCN-O0-NEXT: s_mov_b32 s8, s4 -; GCN-O0-NEXT: s_mov_b32 s7, s5 -; GCN-O0-NEXT: s_mov_b32 s10, s12 -; GCN-O0-NEXT: s_mov_b32 s9, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s10 -; GCN-O0-NEXT: s_addc_u32 s7, s7, s9 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: s_mov_b64 s[10:11], 3 -; GCN-O0-NEXT: s_mov_b32 s6, s4 -; GCN-O0-NEXT: s_mov_b32 s7, s5 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: s_mov_b32 s8, s11 -; GCN-O0-NEXT: s_add_u32 s6, s6, s9 -; GCN-O0-NEXT: s_addc_u32 s8, s7, s8 -; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: s_mov_b64 s[10:11], 2 -; GCN-O0-NEXT: s_mov_b32 s6, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s8, s10 -; GCN-O0-NEXT: s_mov_b32 s7, s11 -; GCN-O0-NEXT: s_add_u32 s6, s6, s8 -; GCN-O0-NEXT: s_addc_u32 s3, s3, s7 -; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 -; GCN-O0-NEXT: s_mov_b32 s7, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s2 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s1 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 1 -; GCN-O0-NEXT: s_mov_b32 s2, s4 +; GCN-O0-NEXT: s_load_dword s0, s[12:13], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[12:13], 0x30 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[12:13], 0x34 +; GCN-O0-NEXT: s_mov_b64 s[4:5], 48 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s0, s12 +; GCN-O0-NEXT: s_mov_b32 s1, s13 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: s_mov_b32 s2, s5 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[4:5], 44 +; GCN-O0-NEXT: s_mov_b32 s2, s12 +; GCN-O0-NEXT: s_mov_b32 s0, s13 +; GCN-O0-NEXT: s_mov_b32 s3, s4 ; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s4, s6 -; GCN-O0-NEXT: s_mov_b32 s3, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s4 +; GCN-O0-NEXT: s_add_u32 s18, s2, s3 +; GCN-O0-NEXT: s_addc_u32 s0, s0, s1 +; GCN-O0-NEXT: ; kill: def $sgpr18 killed $sgpr18 def $sgpr18_sgpr19 +; GCN-O0-NEXT: s_mov_b32 s19, s0 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s19 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[4:5], 1 +; GCN-O0-NEXT: s_mov_b32 s0, s18 +; GCN-O0-NEXT: s_mov_b32 s1, s19 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: s_mov_b32 s2, s5 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b32 s2, 8 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_lshlrev_b32_e64 v1, s2, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s19 +; GCN-O0-NEXT: flat_load_ubyte v0, v[2:3] +; GCN-O0-NEXT: s_mov_b32 s0, 0xff +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v0, v0, s0 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: s_mov_b32 s10, 0xffff +; GCN-O0-NEXT: v_and_b32_e64 v0, s10, v0 +; GCN-O0-NEXT: s_mov_b64 s[8:9], 2 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: s_mov_b32 s1, s19 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s3, s9 +; GCN-O0-NEXT: s_add_u32 s14, s6, s7 ; GCN-O0-NEXT: s_addc_u32 s1, s1, s3 +; GCN-O0-NEXT: ; kill: def $sgpr14 killed $sgpr14 def $sgpr14_sgpr15 +; GCN-O0-NEXT: s_mov_b32 s15, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s15 +; GCN-O0-NEXT: flat_load_ubyte v1, v[1:2] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, s0 +; GCN-O0-NEXT: s_mov_b32 s6, s14 +; GCN-O0-NEXT: s_mov_b32 s1, s15 +; GCN-O0-NEXT: s_mov_b32 s7, s4 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: s_add_u32 s6, s6, s7 +; GCN-O0-NEXT: s_addc_u32 s1, s1, s3 +; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s7, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: flat_load_ubyte v2, v[2:3] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_lshlrev_b32_e64 v2, s2, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: s_mov_b32 s3, 16 +; GCN-O0-NEXT: v_lshlrev_b32_e64 v1, s3, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: s_mov_b32 s17, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 +; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v1, v2 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 4 +; GCN-O0-NEXT: s_mov_b32 s14, s18 +; GCN-O0-NEXT: s_mov_b32 s1, s19 +; GCN-O0-NEXT: s_mov_b32 s15, s6 +; GCN-O0-NEXT: s_mov_b32 s11, s7 +; GCN-O0-NEXT: s_add_u32 s18, s14, s15 +; GCN-O0-NEXT: s_addc_u32 s1, s1, s11 +; GCN-O0-NEXT: ; kill: def $sgpr18 killed $sgpr18 def $sgpr18_sgpr19 +; GCN-O0-NEXT: s_mov_b32 s19, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s19 +; GCN-O0-NEXT: flat_load_ubyte v2, v[2:3] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v2, v2, s0 +; GCN-O0-NEXT: s_mov_b32 s14, s18 +; GCN-O0-NEXT: s_mov_b32 s1, s19 +; GCN-O0-NEXT: s_mov_b32 s15, s4 +; GCN-O0-NEXT: s_mov_b32 s11, s5 +; GCN-O0-NEXT: s_add_u32 s14, s14, s15 +; GCN-O0-NEXT: s_addc_u32 s1, s1, s11 +; GCN-O0-NEXT: ; kill: def $sgpr14 killed $sgpr14 def $sgpr14_sgpr15 +; GCN-O0-NEXT: s_mov_b32 s15, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s15 +; GCN-O0-NEXT: flat_load_ubyte v3, v[3:4] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_lshlrev_b32_e64 v3, s2, v3 +; GCN-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GCN-O0-NEXT: v_and_b32_e64 v2, s10, v2 +; GCN-O0-NEXT: s_mov_b32 s14, s18 +; GCN-O0-NEXT: s_mov_b32 s1, s19 +; GCN-O0-NEXT: s_mov_b32 s15, s8 +; GCN-O0-NEXT: s_mov_b32 s11, s9 +; GCN-O0-NEXT: s_add_u32 s14, s14, s15 +; GCN-O0-NEXT: s_addc_u32 s1, s1, s11 +; GCN-O0-NEXT: ; kill: def $sgpr14 killed $sgpr14 def $sgpr14_sgpr15 +; GCN-O0-NEXT: s_mov_b32 s15, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s15 +; GCN-O0-NEXT: flat_load_ubyte v3, v[3:4] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v3, v3, s0 +; GCN-O0-NEXT: s_mov_b32 s0, s14 +; GCN-O0-NEXT: s_mov_b32 s1, s15 +; GCN-O0-NEXT: s_mov_b32 s14, s4 +; GCN-O0-NEXT: s_mov_b32 s11, s5 +; GCN-O0-NEXT: s_add_u32 s0, s0, s14 +; GCN-O0-NEXT: s_addc_u32 s11, s1, s11 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s0 +; GCN-O0-NEXT: flat_load_ubyte v4, v[4:5] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_lshlrev_b32_e64 v4, s2, v4 +; GCN-O0-NEXT: v_or_b32_e64 v3, v3, v4 +; GCN-O0-NEXT: v_lshlrev_b32_e64 v3, s3, v3 +; GCN-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GCN-O0-NEXT: ; implicit-def: $sgpr0 +; GCN-O0-NEXT: ; implicit-def: $sgpr1 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s0 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 +; GCN-O0-NEXT: s_mov_b32 s11, 32 +; GCN-O0-NEXT: v_lshlrev_b64 v[4:5], s11, v[2:3] +; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v5 +; GCN-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[12:13], 0x24 +; GCN-O0-NEXT: s_load_dword s12, s[12:13], 0x34 +; GCN-O0-NEXT: s_mov_b32 s13, 3 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_lshl_b32 s14, s12, s13 +; GCN-O0-NEXT: s_mov_b64 s[12:13], 0xff +; GCN-O0-NEXT: s_lshl_b64 s[12:13], s[12:13], s14 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v2 +; GCN-O0-NEXT: s_mov_b32 s15, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, v3 +; GCN-O0-NEXT: s_mov_b32 s14, s13 +; GCN-O0-NEXT: v_not_b32_e32 v2, s15 +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_not_b32_e32 v2, s14 +; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v2 +; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 +; GCN-O0-NEXT: s_mov_b32 s14, 0x100 +; GCN-O0-NEXT: s_mov_b32 s15, 1 +; GCN-O0-NEXT: s_or_b32 s15, s14, s15 +; GCN-O0-NEXT: s_and_b32 s14, s10, s15 +; GCN-O0-NEXT: s_lshl_b32 s15, s15, s3 +; GCN-O0-NEXT: s_or_b32 s16, s14, s15 +; GCN-O0-NEXT: s_mov_b32 s14, s16 +; GCN-O0-NEXT: s_mov_b32 s15, s17 +; GCN-O0-NEXT: ; implicit-def: $sgpr18 +; GCN-O0-NEXT: ; implicit-def: $sgpr17 +; GCN-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17 +; GCN-O0-NEXT: s_mov_b32 s17, s18 +; GCN-O0-NEXT: s_lshl_b64 s[16:17], s[16:17], s11 +; GCN-O0-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GCN-O0-NEXT: s_and_b64 s[14:15], s[12:13], s[14:15] +; GCN-O0-NEXT: s_mov_b32 s13, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v0, v1 +; GCN-O0-NEXT: s_mov_b32 s12, s15 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v2 +; GCN-O0-NEXT: v_or_b32_e64 v0, s13, v0 +; GCN-O0-NEXT: v_or_b32_e64 v2, s12, v1 +; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v1, v2 +; GCN-O0-NEXT: v_lshrrev_b64 v[4:5], s11, v[0:1] +; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 +; GCN-O0-NEXT: v_and_b32_e64 v2, s10, v3 +; GCN-O0-NEXT: v_lshrrev_b32_e64 v2, s2, v2 +; GCN-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_lshrrev_b32_e64 v5, s3, v4 +; GCN-O0-NEXT: v_lshrrev_b32_e64 v4, s2, v5 +; GCN-O0-NEXT: v_mov_b32_e32 v7, v0 +; GCN-O0-NEXT: v_and_b32_e64 v6, s10, v7 +; GCN-O0-NEXT: v_lshrrev_b32_e64 v6, s2, v6 +; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GCN-O0-NEXT: v_lshrrev_b32_e64 v9, s3, v0 +; GCN-O0-NEXT: v_lshrrev_b32_e64 v8, s2, v9 +; GCN-O0-NEXT: s_mov_b32 s10, s0 +; GCN-O0-NEXT: s_mov_b32 s2, s1 +; GCN-O0-NEXT: s_mov_b32 s11, s8 +; GCN-O0-NEXT: s_mov_b32 s3, s9 +; GCN-O0-NEXT: s_add_u32 s10, s10, s11 +; GCN-O0-NEXT: s_addc_u32 s2, s2, s3 +; GCN-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11 +; GCN-O0-NEXT: s_mov_b32 s11, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s11 +; GCN-O0-NEXT: flat_store_byte v[0:1], v9 +; GCN-O0-NEXT: s_mov_b32 s2, s10 +; GCN-O0-NEXT: s_mov_b32 s3, s11 +; GCN-O0-NEXT: s_mov_b32 s11, s4 +; GCN-O0-NEXT: s_mov_b32 s10, s5 +; GCN-O0-NEXT: s_add_u32 s2, s2, s11 +; GCN-O0-NEXT: s_addc_u32 s10, s3, s10 ; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s3, s10 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 +; GCN-O0-NEXT: flat_store_byte v[0:1], v8 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_byte v[0:1], v7 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s11, s4 +; GCN-O0-NEXT: s_mov_b32 s10, s5 +; GCN-O0-NEXT: s_add_u32 s2, s2, s11 +; GCN-O0-NEXT: s_addc_u32 s10, s3, s10 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_byte v[0:1], v6 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s0, s1 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s1, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s3 +; GCN-O0-NEXT: s_addc_u32 s0, s0, s1 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s0 +; GCN-O0-NEXT: s_mov_b32 s6, s2 +; GCN-O0-NEXT: s_mov_b32 s0, s3 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s1, s9 +; GCN-O0-NEXT: s_add_u32 s6, s6, s7 +; GCN-O0-NEXT: s_addc_u32 s0, s0, s1 +; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s7, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s7 +; GCN-O0-NEXT: flat_store_byte v[0:1], v5 +; GCN-O0-NEXT: s_mov_b32 s0, s6 +; GCN-O0-NEXT: s_mov_b32 s1, s7 +; GCN-O0-NEXT: s_mov_b32 s7, s4 +; GCN-O0-NEXT: s_mov_b32 s6, s5 +; GCN-O0-NEXT: s_add_u32 s0, s0, s7 +; GCN-O0-NEXT: s_addc_u32 s6, s1, s6 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_byte v[0:1], v4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_byte v[0:1], v3 +; GCN-O0-NEXT: s_mov_b32 s0, s2 +; GCN-O0-NEXT: s_mov_b32 s1, s3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: s_mov_b32 s2, s5 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 ; GCN-O0-NEXT: flat_store_byte v[0:1], v2 ; GCN-O0-NEXT: s_endpgm entry: @@ -1318,13 +1561,25 @@ define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec, ; ; GCN-O0-LABEL: byte16_inselt: ; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN-O0-NEXT: s_mov_b32 s14, -1 -; GCN-O0-NEXT: s_mov_b32 s15, 0xe80000 -; GCN-O0-NEXT: s_add_u32 s12, s12, s11 -; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 52 +; GCN-O0-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 +; GCN-O0-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 +; GCN-O0-NEXT: s_mov_b32 s26, -1 +; GCN-O0-NEXT: s_mov_b32 s27, 0xe80000 +; GCN-O0-NEXT: s_add_u32 s24, s24, s11 +; GCN-O0-NEXT: s_addc_u32 s25, s25, 0 +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x34 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x38 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x3c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x40 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x44 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 64 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_mov_b32 s0, s4 ; GCN-O0-NEXT: s_mov_b32 s1, s5 ; GCN-O0-NEXT: s_mov_b32 s3, s6 @@ -1336,90 +1591,6 @@ define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec, ; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 ; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 53 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 -; GCN-O0-NEXT: flat_load_ubyte v1, v[1:2] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 54 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 -; GCN-O0-NEXT: flat_load_ubyte v2, v[2:3] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 55 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s0 -; GCN-O0-NEXT: flat_load_ubyte v3, v[3:4] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 56 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s0 -; GCN-O0-NEXT: flat_load_ubyte v4, v[4:5] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 57 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s0 -; GCN-O0-NEXT: flat_load_ubyte v5, v[5:6] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 58 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s0 -; GCN-O0-NEXT: flat_load_ubyte v6, v[6:7] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 59 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s0 -; GCN-O0-NEXT: flat_load_ubyte v7, v[7:8] ; GCN-O0-NEXT: s_mov_b64 s[6:7], 60 ; GCN-O0-NEXT: s_mov_b32 s0, s4 ; GCN-O0-NEXT: s_mov_b32 s1, s5 @@ -1429,84 +1600,200 @@ define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec, ; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 56 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 52 +; GCN-O0-NEXT: s_mov_b32 s2, s4 +; GCN-O0-NEXT: s_mov_b32 s0, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s1, s7 +; GCN-O0-NEXT: s_add_u32 s18, s2, s3 +; GCN-O0-NEXT: s_addc_u32 s0, s0, s1 +; GCN-O0-NEXT: ; kill: def $sgpr18 killed $sgpr18 def $sgpr18_sgpr19 +; GCN-O0-NEXT: s_mov_b32 s19, s0 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s19 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[12:13], 8 +; GCN-O0-NEXT: s_mov_b32 s2, s18 +; GCN-O0-NEXT: s_mov_b32 s0, s19 +; GCN-O0-NEXT: s_mov_b32 s3, s12 +; GCN-O0-NEXT: s_mov_b32 s1, s13 +; GCN-O0-NEXT: s_add_u32 s16, s2, s3 +; GCN-O0-NEXT: s_addc_u32 s0, s0, s1 +; GCN-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17 +; GCN-O0-NEXT: s_mov_b32 s17, s0 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s17 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[10:11], 4 +; GCN-O0-NEXT: s_mov_b32 s2, s18 +; GCN-O0-NEXT: s_mov_b32 s0, s19 +; GCN-O0-NEXT: s_mov_b32 s3, s10 +; GCN-O0-NEXT: s_mov_b32 s1, s11 +; GCN-O0-NEXT: s_add_u32 s14, s2, s3 +; GCN-O0-NEXT: s_addc_u32 s0, s0, s1 +; GCN-O0-NEXT: ; kill: def $sgpr14 killed $sgpr14 def $sgpr14_sgpr15 +; GCN-O0-NEXT: s_mov_b32 s15, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s15 +; GCN-O0-NEXT: flat_load_ubyte v1, v[1:2] +; GCN-O0-NEXT: s_mov_b64 s[8:9], 2 +; GCN-O0-NEXT: s_mov_b32 s2, s18 +; GCN-O0-NEXT: s_mov_b32 s0, s19 +; GCN-O0-NEXT: s_mov_b32 s3, s8 +; GCN-O0-NEXT: s_mov_b32 s1, s9 +; GCN-O0-NEXT: s_add_u32 s2, s2, s3 +; GCN-O0-NEXT: s_addc_u32 s0, s0, s1 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s3 +; GCN-O0-NEXT: flat_load_ubyte v2, v[2:3] +; GCN-O0-NEXT: v_mov_b32_e32 v3, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s19 +; GCN-O0-NEXT: flat_load_ubyte v3, v[3:4] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 1 +; GCN-O0-NEXT: s_mov_b32 s0, s18 +; GCN-O0-NEXT: s_mov_b32 s1, s19 +; GCN-O0-NEXT: s_mov_b32 s19, s6 +; GCN-O0-NEXT: s_mov_b32 s18, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s19 +; GCN-O0-NEXT: s_addc_u32 s18, s1, s18 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s0 +; GCN-O0-NEXT: flat_load_ubyte v4, v[4:5] +; GCN-O0-NEXT: s_mov_b32 s0, s16 +; GCN-O0-NEXT: s_mov_b32 s1, s17 +; GCN-O0-NEXT: s_mov_b32 s19, s10 +; GCN-O0-NEXT: s_mov_b32 s18, s11 +; GCN-O0-NEXT: s_add_u32 s0, s0, s19 +; GCN-O0-NEXT: s_addc_u32 s18, s1, s18 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s18 +; GCN-O0-NEXT: s_mov_b32 s18, s0 +; GCN-O0-NEXT: s_mov_b32 s19, s1 +; GCN-O0-NEXT: s_mov_b32 s21, s8 +; GCN-O0-NEXT: s_mov_b32 s20, s9 +; GCN-O0-NEXT: s_add_u32 s18, s18, s21 +; GCN-O0-NEXT: s_addc_u32 s20, s19, s20 +; GCN-O0-NEXT: ; kill: def $sgpr18 killed $sgpr18 def $sgpr18_sgpr19 +; GCN-O0-NEXT: s_mov_b32 s19, s20 +; GCN-O0-NEXT: s_mov_b32 s20, s18 +; GCN-O0-NEXT: s_mov_b32 s21, s19 +; GCN-O0-NEXT: s_mov_b32 s23, s6 +; GCN-O0-NEXT: s_mov_b32 s22, s7 +; GCN-O0-NEXT: s_add_u32 s20, s20, s23 +; GCN-O0-NEXT: s_addc_u32 s22, s21, s22 +; GCN-O0-NEXT: ; kill: def $sgpr20 killed $sgpr20 def $sgpr20_sgpr21 +; GCN-O0-NEXT: s_mov_b32 s21, s22 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s20 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s21 +; GCN-O0-NEXT: flat_load_ubyte v5, v[5:6] +; GCN-O0-NEXT: v_mov_b32_e32 v6, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s19 +; GCN-O0-NEXT: flat_load_ubyte v6, v[6:7] +; GCN-O0-NEXT: s_mov_b32 s18, s0 +; GCN-O0-NEXT: s_mov_b32 s19, s1 +; GCN-O0-NEXT: s_mov_b32 s21, s6 +; GCN-O0-NEXT: s_mov_b32 s20, s7 +; GCN-O0-NEXT: s_add_u32 s18, s18, s21 +; GCN-O0-NEXT: s_addc_u32 s20, s19, s20 +; GCN-O0-NEXT: ; kill: def $sgpr18 killed $sgpr18 def $sgpr18_sgpr19 +; GCN-O0-NEXT: s_mov_b32 s19, s20 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s19 +; GCN-O0-NEXT: flat_load_ubyte v7, v[7:8] ; GCN-O0-NEXT: v_mov_b32_e32 v9, s1 ; GCN-O0-NEXT: v_mov_b32_e32 v8, s0 ; GCN-O0-NEXT: flat_load_ubyte v8, v[8:9] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 61 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: s_mov_b32 s0, s16 +; GCN-O0-NEXT: s_mov_b32 s1, s17 +; GCN-O0-NEXT: s_mov_b32 s19, s8 +; GCN-O0-NEXT: s_mov_b32 s18, s9 +; GCN-O0-NEXT: s_add_u32 s0, s0, s19 +; GCN-O0-NEXT: s_addc_u32 s18, s1, s18 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s0 +; GCN-O0-NEXT: s_mov_b32 s1, s18 +; GCN-O0-NEXT: s_mov_b32 s18, s0 +; GCN-O0-NEXT: s_mov_b32 s19, s1 +; GCN-O0-NEXT: s_mov_b32 s21, s6 +; GCN-O0-NEXT: s_mov_b32 s20, s7 +; GCN-O0-NEXT: s_add_u32 s18, s18, s21 +; GCN-O0-NEXT: s_addc_u32 s20, s19, s20 +; GCN-O0-NEXT: ; kill: def $sgpr18 killed $sgpr18 def $sgpr18_sgpr19 +; GCN-O0-NEXT: s_mov_b32 s19, s20 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s19 ; GCN-O0-NEXT: flat_load_ubyte v9, v[9:10] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 62 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 ; GCN-O0-NEXT: v_mov_b32_e32 v11, s1 ; GCN-O0-NEXT: v_mov_b32_e32 v10, s0 ; GCN-O0-NEXT: flat_load_ubyte v10, v[10:11] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 63 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: s_mov_b32 s0, s16 +; GCN-O0-NEXT: s_mov_b32 s1, s17 +; GCN-O0-NEXT: s_mov_b32 s17, s6 +; GCN-O0-NEXT: s_mov_b32 s16, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s17 +; GCN-O0-NEXT: s_addc_u32 s16, s1, s16 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_mov_b32 s1, s16 ; GCN-O0-NEXT: v_mov_b32_e32 v12, s1 ; GCN-O0-NEXT: v_mov_b32_e32 v11, s0 ; GCN-O0-NEXT: flat_load_ubyte v11, v[11:12] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 64 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: s_mov_b32 s0, s14 +; GCN-O0-NEXT: s_mov_b32 s1, s15 +; GCN-O0-NEXT: s_mov_b32 s17, s8 +; GCN-O0-NEXT: s_mov_b32 s16, s9 +; GCN-O0-NEXT: s_add_u32 s0, s0, s17 +; GCN-O0-NEXT: s_addc_u32 s16, s1, s16 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v13, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s0 +; GCN-O0-NEXT: s_mov_b32 s1, s16 +; GCN-O0-NEXT: s_mov_b32 s16, s0 +; GCN-O0-NEXT: s_mov_b32 s17, s1 +; GCN-O0-NEXT: s_mov_b32 s19, s6 +; GCN-O0-NEXT: s_mov_b32 s18, s7 +; GCN-O0-NEXT: s_add_u32 s16, s16, s19 +; GCN-O0-NEXT: s_addc_u32 s18, s17, s18 +; GCN-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17 +; GCN-O0-NEXT: s_mov_b32 s17, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v13, s17 ; GCN-O0-NEXT: flat_load_ubyte v12, v[12:13] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x41 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 ; GCN-O0-NEXT: v_mov_b32_e32 v14, s1 ; GCN-O0-NEXT: v_mov_b32_e32 v13, s0 ; GCN-O0-NEXT: flat_load_ubyte v13, v[13:14] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x42 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: s_mov_b32 s0, s14 +; GCN-O0-NEXT: s_mov_b32 s1, s15 +; GCN-O0-NEXT: s_mov_b32 s15, s6 +; GCN-O0-NEXT: s_mov_b32 s14, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s15 +; GCN-O0-NEXT: s_addc_u32 s14, s1, s14 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_mov_b32 s1, s14 ; GCN-O0-NEXT: v_mov_b32_e32 v15, s1 ; GCN-O0-NEXT: v_mov_b32_e32 v14, s0 ; GCN-O0-NEXT: flat_load_ubyte v14, v[14:15] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x43 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s0, s2 +; GCN-O0-NEXT: s_mov_b32 s1, s3 ; GCN-O0-NEXT: s_mov_b32 s3, s6 ; GCN-O0-NEXT: s_mov_b32 s2, s7 ; GCN-O0-NEXT: s_add_u32 s0, s0, s3 @@ -1520,215 +1807,235 @@ define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec, ; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x44 ; GCN-O0-NEXT: s_mov_b32 s3, 15 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_and_b32 s3, s2, s3 -; GCN-O0-NEXT: s_mov_b32 s2, 0 -; GCN-O0-NEXT: s_or_b32 s2, s2, s3 +; GCN-O0-NEXT: s_and_b32 s2, s2, s3 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_mul_i32 s2, s2, s3 +; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: s_add_i32 s17, s4, s2 +; GCN-O0-NEXT: s_mov_b32 s2, 2 +; GCN-O0-NEXT: s_add_i32 s16, s4, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v16, s16 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v15, off, s[12:15], 0 offset:15 -; GCN-O0-NEXT: buffer_store_byte v14, off, s[12:15], 0 offset:14 -; GCN-O0-NEXT: buffer_store_byte v13, off, s[12:15], 0 offset:13 -; GCN-O0-NEXT: buffer_store_byte v12, off, s[12:15], 0 offset:12 -; GCN-O0-NEXT: buffer_store_byte v11, off, s[12:15], 0 offset:11 -; GCN-O0-NEXT: buffer_store_byte v10, off, s[12:15], 0 offset:10 -; GCN-O0-NEXT: buffer_store_byte v9, off, s[12:15], 0 offset:9 -; GCN-O0-NEXT: buffer_store_byte v8, off, s[12:15], 0 offset:8 -; GCN-O0-NEXT: buffer_store_byte v7, off, s[12:15], 0 offset:7 -; GCN-O0-NEXT: buffer_store_byte v6, off, s[12:15], 0 offset:6 -; GCN-O0-NEXT: buffer_store_byte v5, off, s[12:15], 0 offset:5 -; GCN-O0-NEXT: buffer_store_byte v4, off, s[12:15], 0 offset:4 -; GCN-O0-NEXT: buffer_store_byte v3, off, s[12:15], 0 offset:3 -; GCN-O0-NEXT: buffer_store_byte v2, off, s[12:15], 0 offset:2 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:1 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 +; GCN-O0-NEXT: buffer_store_byte v15, v16, s[24:27], 0 offen offset:1 +; GCN-O0-NEXT: s_mov_b32 s3, 4 +; GCN-O0-NEXT: s_add_i32 s15, s4, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v15, s15 +; GCN-O0-NEXT: buffer_store_byte v14, v15, s[24:27], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v14, s15 +; GCN-O0-NEXT: buffer_store_byte v13, v14, s[24:27], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s14, s15, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v13, s14 +; GCN-O0-NEXT: buffer_store_byte v12, v13, s[24:27], 0 offen offset:1 +; GCN-O0-NEXT: s_mov_b32 s5, 8 +; GCN-O0-NEXT: s_add_i32 s4, s4, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s4 +; GCN-O0-NEXT: buffer_store_byte v11, v12, s[24:27], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s4 +; GCN-O0-NEXT: buffer_store_byte v10, v11, s[24:27], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s5, s4, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s5 +; GCN-O0-NEXT: buffer_store_byte v9, v10, s[24:27], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s4 +; GCN-O0-NEXT: buffer_store_byte v8, v9, s[24:27], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s3, s4, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s3 +; GCN-O0-NEXT: buffer_store_byte v7, v8, s[24:27], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s3 +; GCN-O0-NEXT: buffer_store_byte v6, v7, s[24:27], 0 offen offset:2 +; GCN-O0-NEXT: s_add_i32 s2, s3, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s2 +; GCN-O0-NEXT: buffer_store_byte v5, v6, s[24:27], 0 offen offset:1 +; GCN-O0-NEXT: buffer_store_byte v4, off, s[24:27], 0 offset:1 +; GCN-O0-NEXT: buffer_store_byte v3, off, s[24:27], 0 +; GCN-O0-NEXT: buffer_store_byte v2, off, s[24:27], 0 offset:2 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[24:27], 0 offset:4 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[24:27], 0 offset:8 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s2 -; GCN-O0-NEXT: buffer_store_byte v0, v1, s[12:15], 0 offen -; GCN-O0-NEXT: buffer_load_ubyte v2, off, s[12:15], 0 -; GCN-O0-NEXT: buffer_load_ubyte v3, off, s[12:15], 0 offset:1 -; GCN-O0-NEXT: buffer_load_ubyte v4, off, s[12:15], 0 offset:2 -; GCN-O0-NEXT: buffer_load_ubyte v5, off, s[12:15], 0 offset:3 -; GCN-O0-NEXT: buffer_load_ubyte v6, off, s[12:15], 0 offset:4 -; GCN-O0-NEXT: buffer_load_ubyte v7, off, s[12:15], 0 offset:5 -; GCN-O0-NEXT: buffer_load_ubyte v8, off, s[12:15], 0 offset:6 -; GCN-O0-NEXT: buffer_load_ubyte v9, off, s[12:15], 0 offset:7 -; GCN-O0-NEXT: buffer_load_ubyte v10, off, s[12:15], 0 offset:8 -; GCN-O0-NEXT: buffer_load_ubyte v11, off, s[12:15], 0 offset:9 -; GCN-O0-NEXT: buffer_load_ubyte v12, off, s[12:15], 0 offset:10 -; GCN-O0-NEXT: buffer_load_ubyte v13, off, s[12:15], 0 offset:11 -; GCN-O0-NEXT: buffer_load_ubyte v14, off, s[12:15], 0 offset:12 -; GCN-O0-NEXT: buffer_load_ubyte v15, off, s[12:15], 0 offset:13 -; GCN-O0-NEXT: buffer_load_ubyte v16, off, s[12:15], 0 offset:14 -; GCN-O0-NEXT: buffer_load_ubyte v17, off, s[12:15], 0 offset:15 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 15 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s17 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[24:27], 0 offen +; GCN-O0-NEXT: v_mov_b32_e32 v0, s16 +; GCN-O0-NEXT: buffer_load_ubyte v5, v0, s[24:27], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s15 +; GCN-O0-NEXT: buffer_load_ubyte v7, v0, s[24:27], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s15 +; GCN-O0-NEXT: buffer_load_ubyte v8, v0, s[24:27], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s14 +; GCN-O0-NEXT: buffer_load_ubyte v9, v0, s[24:27], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s5 +; GCN-O0-NEXT: buffer_load_ubyte v13, v0, s[24:27], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: buffer_load_ubyte v11, v0, s[24:27], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: buffer_load_ubyte v12, v0, s[24:27], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: buffer_load_ubyte v14, v0, s[24:27], 0 offen offset:4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 +; GCN-O0-NEXT: buffer_load_ubyte v15, v0, s[24:27], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 +; GCN-O0-NEXT: buffer_load_ubyte v16, v0, s[24:27], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: buffer_load_ubyte v17, v0, s[24:27], 0 offen offset:1 +; GCN-O0-NEXT: buffer_load_ubyte v2, off, s[24:27], 0 +; GCN-O0-NEXT: buffer_load_ubyte v3, off, s[24:27], 0 offset:1 +; GCN-O0-NEXT: buffer_load_ubyte v4, off, s[24:27], 0 offset:2 +; GCN-O0-NEXT: buffer_load_ubyte v6, off, s[24:27], 0 offset:4 +; GCN-O0-NEXT: buffer_load_ubyte v10, off, s[24:27], 0 offset:8 ; GCN-O0-NEXT: s_mov_b32 s2, s0 ; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_mov_b32 s5, s12 +; GCN-O0-NEXT: s_mov_b32 s4, s13 ; GCN-O0-NEXT: s_add_u32 s2, s2, s5 ; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 ; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s4, s2 +; GCN-O0-NEXT: s_mov_b32 s5, s3 +; GCN-O0-NEXT: s_mov_b32 s13, s10 +; GCN-O0-NEXT: s_mov_b32 s12, s11 +; GCN-O0-NEXT: s_add_u32 s4, s4, s13 +; GCN-O0-NEXT: s_addc_u32 s12, s5, s12 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s12 +; GCN-O0-NEXT: s_mov_b32 s12, s4 +; GCN-O0-NEXT: s_mov_b32 s13, s5 +; GCN-O0-NEXT: s_mov_b32 s15, s8 +; GCN-O0-NEXT: s_mov_b32 s14, s9 +; GCN-O0-NEXT: s_add_u32 s12, s12, s15 +; GCN-O0-NEXT: s_addc_u32 s14, s13, s14 +; GCN-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13 +; GCN-O0-NEXT: s_mov_b32 s13, s14 +; GCN-O0-NEXT: s_mov_b32 s14, s12 +; GCN-O0-NEXT: s_mov_b32 s15, s13 +; GCN-O0-NEXT: s_mov_b32 s17, s6 +; GCN-O0-NEXT: s_mov_b32 s16, s7 +; GCN-O0-NEXT: s_add_u32 s14, s14, s17 +; GCN-O0-NEXT: s_addc_u32 s16, s15, s16 +; GCN-O0-NEXT: ; kill: def $sgpr14 killed $sgpr14 def $sgpr14_sgpr15 +; GCN-O0-NEXT: s_mov_b32 s15, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s15 +; GCN-O0-NEXT: s_waitcnt vmcnt(5) ; GCN-O0-NEXT: flat_store_byte v[0:1], v17 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 14 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s13 ; GCN-O0-NEXT: flat_store_byte v[0:1], v16 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 13 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: s_mov_b32 s12, s4 +; GCN-O0-NEXT: s_mov_b32 s13, s5 +; GCN-O0-NEXT: s_mov_b32 s15, s6 +; GCN-O0-NEXT: s_mov_b32 s14, s7 +; GCN-O0-NEXT: s_add_u32 s12, s12, s15 +; GCN-O0-NEXT: s_addc_u32 s14, s13, s14 +; GCN-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13 +; GCN-O0-NEXT: s_mov_b32 s13, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s13 ; GCN-O0-NEXT: flat_store_byte v[0:1], v15 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 12 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 ; GCN-O0-NEXT: flat_store_byte v[0:1], v14 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 11 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: s_mov_b32 s4, s2 +; GCN-O0-NEXT: s_mov_b32 s5, s3 +; GCN-O0-NEXT: s_mov_b32 s13, s8 +; GCN-O0-NEXT: s_mov_b32 s12, s9 +; GCN-O0-NEXT: s_add_u32 s4, s4, s13 +; GCN-O0-NEXT: s_addc_u32 s12, s5, s12 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s12 +; GCN-O0-NEXT: s_mov_b32 s12, s4 +; GCN-O0-NEXT: s_mov_b32 s13, s5 +; GCN-O0-NEXT: s_mov_b32 s15, s6 +; GCN-O0-NEXT: s_mov_b32 s14, s7 +; GCN-O0-NEXT: s_add_u32 s12, s12, s15 +; GCN-O0-NEXT: s_addc_u32 s14, s13, s14 +; GCN-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13 +; GCN-O0-NEXT: s_mov_b32 s13, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s13 ; GCN-O0-NEXT: flat_store_byte v[0:1], v13 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 10 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 ; GCN-O0-NEXT: flat_store_byte v[0:1], v12 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 9 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: s_mov_b32 s4, s2 +; GCN-O0-NEXT: s_mov_b32 s5, s3 +; GCN-O0-NEXT: s_mov_b32 s13, s6 +; GCN-O0-NEXT: s_mov_b32 s12, s7 +; GCN-O0-NEXT: s_add_u32 s4, s4, s13 +; GCN-O0-NEXT: s_addc_u32 s12, s5, s12 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 ; GCN-O0-NEXT: flat_store_byte v[0:1], v11 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 8 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: s_waitcnt vmcnt(7) ; GCN-O0-NEXT: flat_store_byte v[0:1], v10 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 7 ; GCN-O0-NEXT: s_mov_b32 s2, s0 ; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_mov_b32 s5, s10 +; GCN-O0-NEXT: s_mov_b32 s4, s11 ; GCN-O0-NEXT: s_add_u32 s2, s2, s5 ; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 ; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: s_mov_b32 s4, s2 +; GCN-O0-NEXT: s_mov_b32 s5, s3 +; GCN-O0-NEXT: s_mov_b32 s11, s8 +; GCN-O0-NEXT: s_mov_b32 s10, s9 +; GCN-O0-NEXT: s_add_u32 s4, s4, s11 +; GCN-O0-NEXT: s_addc_u32 s10, s5, s10 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s10 +; GCN-O0-NEXT: s_mov_b32 s10, s4 +; GCN-O0-NEXT: s_mov_b32 s11, s5 +; GCN-O0-NEXT: s_mov_b32 s13, s6 +; GCN-O0-NEXT: s_mov_b32 s12, s7 +; GCN-O0-NEXT: s_add_u32 s10, s10, s13 +; GCN-O0-NEXT: s_addc_u32 s12, s11, s12 +; GCN-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11 +; GCN-O0-NEXT: s_mov_b32 s11, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s11 ; GCN-O0-NEXT: flat_store_byte v[0:1], v9 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 6 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 ; GCN-O0-NEXT: flat_store_byte v[0:1], v8 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 5 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: s_mov_b32 s4, s2 +; GCN-O0-NEXT: s_mov_b32 s5, s3 +; GCN-O0-NEXT: s_mov_b32 s11, s6 +; GCN-O0-NEXT: s_mov_b32 s10, s7 +; GCN-O0-NEXT: s_add_u32 s4, s4, s11 +; GCN-O0-NEXT: s_addc_u32 s10, s5, s10 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 ; GCN-O0-NEXT: flat_store_byte v[0:1], v7 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 4 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 ; GCN-O0-NEXT: flat_store_byte v[0:1], v6 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 3 ; GCN-O0-NEXT: s_mov_b32 s2, s0 ; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_mov_b32 s5, s8 +; GCN-O0-NEXT: s_mov_b32 s4, s9 ; GCN-O0-NEXT: s_add_u32 s2, s2, s5 ; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 ; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: s_mov_b32 s4, s2 +; GCN-O0-NEXT: s_mov_b32 s5, s3 +; GCN-O0-NEXT: s_mov_b32 s9, s6 +; GCN-O0-NEXT: s_mov_b32 s8, s7 +; GCN-O0-NEXT: s_add_u32 s4, s4, s9 +; GCN-O0-NEXT: s_addc_u32 s8, s5, s8 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 ; GCN-O0-NEXT: flat_store_byte v[0:1], v5 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 2 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 ; GCN-O0-NEXT: flat_store_byte v[0:1], v4 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 1 ; GCN-O0-NEXT: s_mov_b32 s2, s0 ; GCN-O0-NEXT: s_mov_b32 s3, s1 ; GCN-O0-NEXT: s_mov_b32 s5, s6 @@ -1775,11 +2082,17 @@ define amdgpu_kernel void @double2_inselt(ptr addrspace(1) %out, <2 x double> %v ; GCN-O0-LABEL: double2_inselt: ; GCN-O0: ; %bb.0: ; %entry ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x44 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-O0-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x44 -; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_mov_b32 s3, 2 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 +; GCN-O0-NEXT: s_mul_i32 s2, s2, s3 ; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 ; GCN-O0-NEXT: s_mov_b32 s3, s4 ; GCN-O0-NEXT: v_mov_b32_e32 v2, s8 @@ -1789,10 +2102,12 @@ define amdgpu_kernel void @double2_inselt(ptr addrspace(1) %out, <2 x double> %v ; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 ; GCN-O0-NEXT: s_mov_b32 m0, s2 ; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_add_i32 s2, s2, s3 ; GCN-O0-NEXT: s_mov_b32 s3, s5 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 ; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movreld_b32_e32 v3, v0 +; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 ; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -1854,47 +2169,69 @@ define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %v ; ; GCN-O0-LABEL: double5_inselt: ; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x84 +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0xa4 +; GCN-O0-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN-O0-NEXT: s_mov_b64 s[8:9], 0x64 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s6, s9 +; GCN-O0-NEXT: s_add_u32 s0, s0, s7 +; GCN-O0-NEXT: s_addc_u32 s6, s1, s6 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s6 +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s6, s1 +; GCN-O0-NEXT: s_mov_b32 s7, s0 +; GCN-O0-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x64 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s8, s23 +; GCN-O0-NEXT: s_mov_b32 s0, s22 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s8 +; GCN-O0-NEXT: s_mov_b32 s8, s1 +; GCN-O0-NEXT: s_mov_b32 s9, s0 +; GCN-O0-NEXT: s_mov_b32 s10, s21 +; GCN-O0-NEXT: s_mov_b32 s0, s20 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s10 ; GCN-O0-NEXT: s_mov_b32 s10, s1 ; GCN-O0-NEXT: s_mov_b32 s11, s0 -; GCN-O0-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x64 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s12, s27 -; GCN-O0-NEXT: s_mov_b32 s13, s26 -; GCN-O0-NEXT: s_mov_b32 s14, s25 -; GCN-O0-NEXT: s_mov_b32 s15, s24 -; GCN-O0-NEXT: s_mov_b32 s16, s23 -; GCN-O0-NEXT: s_mov_b32 s17, s22 -; GCN-O0-NEXT: s_mov_b32 s18, s21 -; GCN-O0-NEXT: s_mov_b32 s19, s20 -; GCN-O0-NEXT: ; implicit-def: $sgpr9 -; GCN-O0-NEXT: ; implicit-def: $sgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr8 -; GCN-O0-NEXT: ; implicit-def: $sgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr7 -; GCN-O0-NEXT: ; implicit-def: $sgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr6 -; GCN-O0-NEXT: ; implicit-def: $sgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr1 -; GCN-O0-NEXT: ; implicit-def: $sgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr20 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s19 -; GCN-O0-NEXT: v_mov_b32_e32 v30, s18 -; GCN-O0-NEXT: v_mov_b32_e32 v29, s17 -; GCN-O0-NEXT: v_mov_b32_e32 v28, s16 -; GCN-O0-NEXT: v_mov_b32_e32 v27, s15 -; GCN-O0-NEXT: v_mov_b32_e32 v26, s14 -; GCN-O0-NEXT: v_mov_b32_e32 v25, s13 -; GCN-O0-NEXT: v_mov_b32_e32 v24, s12 -; GCN-O0-NEXT: v_mov_b32_e32 v23, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v22, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v21, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v20, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v19, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v18, s6 +; GCN-O0-NEXT: s_mov_b32 s12, s19 +; GCN-O0-NEXT: s_mov_b32 s0, s18 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s12 +; GCN-O0-NEXT: s_mov_b32 s12, s1 +; GCN-O0-NEXT: s_mov_b32 s13, s0 +; GCN-O0-NEXT: s_mov_b32 s14, s17 +; GCN-O0-NEXT: s_mov_b32 s0, s16 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s14 +; GCN-O0-NEXT: s_mov_b32 s14, s1 +; GCN-O0-NEXT: s_mov_b32 s15, s0 +; GCN-O0-NEXT: ; implicit-def: $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s0, s1 +; GCN-O0-NEXT: ; implicit-def: $sgpr16_sgpr17 +; GCN-O0-NEXT: s_mov_b32 s1, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s15 +; GCN-O0-NEXT: v_mov_b32_e32 v30, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v29, s13 +; GCN-O0-NEXT: v_mov_b32_e32 v28, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v27, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v26, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v25, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v24, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v23, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v22, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v21, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v20, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v19, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v18, s0 ; GCN-O0-NEXT: v_mov_b32_e32 v17, s1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 ; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 killed $exec @@ -1914,30 +2251,60 @@ define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %v ; GCN-O0-NEXT: v_mov_b32_e32 v15, v17 ; GCN-O0-NEXT: v_mov_b32_e32 v16, v0 ; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0xa4 -; GCN-O0-NEXT: s_mov_b32 s1, 1 +; GCN-O0-NEXT: s_mov_b32 s1, 2 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s0, s0, s1 +; GCN-O0-NEXT: s_mul_i32 s0, s0, s1 ; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 ; GCN-O0-NEXT: s_mov_b32 s1, s4 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s1 ; GCN-O0-NEXT: s_mov_b32 m0, s0 ; GCN-O0-NEXT: v_movreld_b32_e32 v1, v0 +; GCN-O0-NEXT: s_mov_b32 s1, 1 +; GCN-O0-NEXT: s_add_i32 s0, s0, s1 ; GCN-O0-NEXT: s_mov_b32 s1, s5 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s1 ; GCN-O0-NEXT: s_mov_b32 m0, s0 -; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, v4 -; GCN-O0-NEXT: v_mov_b32_e32 v17, v3 -; GCN-O0-NEXT: v_mov_b32_e32 v18, v2 +; GCN-O0-NEXT: v_movreld_b32_e32 v1, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, v8 +; GCN-O0-NEXT: v_mov_b32_e32 v17, v7 +; GCN-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v18, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, v18 +; GCN-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 killed $vgpr17_vgpr18 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v18, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v19, v5 +; GCN-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v20, v18 +; GCN-O0-NEXT: v_mov_b32_e32 v18, v20 +; GCN-O0-NEXT: v_mov_b32_e32 v27, v19 +; GCN-O0-NEXT: v_mov_b32_e32 v21, v4 +; GCN-O0-NEXT: v_mov_b32_e32 v19, v3 +; GCN-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v20, v21 +; GCN-O0-NEXT: v_mov_b32_e32 v28, v20 +; GCN-O0-NEXT: v_mov_b32_e32 v29, v19 +; GCN-O0-NEXT: v_mov_b32_e32 v21, v2 ; GCN-O0-NEXT: v_mov_b32_e32 v19, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v20, v8 -; GCN-O0-NEXT: v_mov_b32_e32 v21, v7 -; GCN-O0-NEXT: v_mov_b32_e32 v26, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v22, v5 -; GCN-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23_vgpr24_vgpr25 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v23, v26 -; GCN-O0-NEXT: v_mov_b32_e32 v24, v21 -; GCN-O0-NEXT: v_mov_b32_e32 v25, v20 +; GCN-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v20, v21 +; GCN-O0-NEXT: v_mov_b32_e32 v30, v20 +; GCN-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 killed $vgpr19_vgpr20 killed $exec +; GCN-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v20, v30 +; GCN-O0-NEXT: v_mov_b32_e32 v21, v29 +; GCN-O0-NEXT: v_mov_b32_e32 v22, v28 +; GCN-O0-NEXT: v_mov_b32_e32 v23, v27 +; GCN-O0-NEXT: v_mov_b32_e32 v24, v18 +; GCN-O0-NEXT: v_mov_b32_e32 v25, v17 +; GCN-O0-NEXT: v_mov_b32_e32 v26, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, v26 +; GCN-O0-NEXT: v_mov_b32_e32 v17, v25 +; GCN-O0-NEXT: v_mov_b32_e32 v18, v24 +; GCN-O0-NEXT: v_mov_b32_e32 v27, v23 +; GCN-O0-NEXT: ; kill: def $vgpr27 killed $vgpr27 def $vgpr27_vgpr28_vgpr29_vgpr30 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v28, v18 +; GCN-O0-NEXT: v_mov_b32_e32 v29, v17 +; GCN-O0-NEXT: v_mov_b32_e32 v30, v0 ; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 ; GCN-O0-NEXT: s_mov_b32 s0, s2 ; GCN-O0-NEXT: s_mov_b32 s1, s3 @@ -1947,9 +2314,13 @@ define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %v ; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; GCN-O0-NEXT: s_mov_b32 s1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v21, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v20, s0 -; GCN-O0-NEXT: flat_store_dwordx4 v[20:21], v[22:25] +; GCN-O0-NEXT: v_mov_b32_e32 v18, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v17, s0 +; GCN-O0-NEXT: flat_store_dwordx4 v[17:18], v[27:30] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v22 +; GCN-O0-NEXT: v_mov_b32_e32 v17, v21 +; GCN-O0-NEXT: v_mov_b32_e32 v18, v20 +; GCN-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 killed $vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26 killed $exec ; GCN-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20_vgpr21_vgpr22 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v20, v18 ; GCN-O0-NEXT: v_mov_b32_e32 v21, v17 @@ -2032,12 +2403,17 @@ define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %v ; ; GCN-O0-LABEL: double8_inselt: ; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0xa4 +; GCN-O0-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-O0-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0xa4 -; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_mov_b32 s3, 2 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 +; GCN-O0-NEXT: s_mul_i32 s2, s2, s3 ; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 ; GCN-O0-NEXT: s_mov_b32 s3, s4 ; GCN-O0-NEXT: v_mov_b32_e32 v7, s8 @@ -2059,10 +2435,12 @@ define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %v ; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 ; GCN-O0-NEXT: s_mov_b32 m0, s2 ; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_add_i32 s2, s2, s3 ; GCN-O0-NEXT: s_mov_b32 s3, s5 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 ; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movreld_b32_e32 v8, v0 +; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0 ; GCN-O0-NEXT: v_mov_b32_e32 v0, v22 ; GCN-O0-NEXT: v_mov_b32_e32 v1, v21 ; GCN-O0-NEXT: v_mov_b32_e32 v6, v20 @@ -2071,26 +2449,6 @@ define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %v ; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 ; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 ; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 48 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v18 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v17 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v16 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v15 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 ; GCN-O0-NEXT: s_mov_b64 s[6:7], 32 ; GCN-O0-NEXT: s_mov_b32 s2, s0 ; GCN-O0-NEXT: s_mov_b32 s3, s1 @@ -2100,6 +2458,26 @@ define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %v ; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 ; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 +; GCN-O0-NEXT: s_mov_b32 s4, s2 +; GCN-O0-NEXT: s_mov_b32 s5, s3 +; GCN-O0-NEXT: s_mov_b32 s9, s6 +; GCN-O0-NEXT: s_mov_b32 s8, s7 +; GCN-O0-NEXT: s_add_u32 s4, s4, s9 +; GCN-O0-NEXT: s_addc_u32 s8, s5, s8 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v18 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v17 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v16 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v15 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 ; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -2111,7 +2489,6 @@ define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %v ; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 ; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 ; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 ; GCN-O0-NEXT: s_mov_b32 s2, s0 ; GCN-O0-NEXT: s_mov_b32 s3, s1 ; GCN-O0-NEXT: s_mov_b32 s5, s6 @@ -2191,47 +2568,95 @@ define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %v ; ; GCN-O0-LABEL: double7_inselt: ; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x94 +; GCN-O0-NEXT: s_mov_b64 s[0:1], s[4:5] +; GCN-O0-NEXT: s_load_dword s2, s[0:1], 0xa4 +; GCN-O0-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s6, s1 -; GCN-O0-NEXT: s_mov_b32 s7, s0 -; GCN-O0-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x84 +; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s8, s15 -; GCN-O0-NEXT: s_mov_b32 s9, s14 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x64 +; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-O0-NEXT: s_mov_b64 s[8:9], 0x64 +; GCN-O0-NEXT: s_mov_b32 s6, s0 +; GCN-O0-NEXT: s_mov_b32 s4, s1 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s5, s9 +; GCN-O0-NEXT: s_add_u32 s8, s6, s7 +; GCN-O0-NEXT: s_addc_u32 s4, s4, s5 +; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s9, s4 +; GCN-O0-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x20 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s12, s27 -; GCN-O0-NEXT: s_mov_b32 s13, s26 -; GCN-O0-NEXT: s_mov_b32 s14, s25 -; GCN-O0-NEXT: s_mov_b32 s15, s24 -; GCN-O0-NEXT: s_mov_b32 s16, s23 -; GCN-O0-NEXT: s_mov_b32 s17, s22 -; GCN-O0-NEXT: s_mov_b32 s18, s21 -; GCN-O0-NEXT: s_mov_b32 s19, s20 -; GCN-O0-NEXT: ; implicit-def: $sgpr1 -; GCN-O0-NEXT: ; implicit-def: $sgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr20 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s19 -; GCN-O0-NEXT: v_mov_b32_e32 v30, s18 -; GCN-O0-NEXT: v_mov_b32_e32 v29, s17 -; GCN-O0-NEXT: v_mov_b32_e32 v28, s16 -; GCN-O0-NEXT: v_mov_b32_e32 v27, s15 -; GCN-O0-NEXT: v_mov_b32_e32 v26, s14 -; GCN-O0-NEXT: v_mov_b32_e32 v25, s13 -; GCN-O0-NEXT: v_mov_b32_e32 v24, s12 -; GCN-O0-NEXT: v_mov_b32_e32 v23, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: s_mov_b32 s6, s15 +; GCN-O0-NEXT: s_mov_b32 s4, s14 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s10, s5 +; GCN-O0-NEXT: s_mov_b32 s11, s4 +; GCN-O0-NEXT: s_mov_b32 s6, s13 +; GCN-O0-NEXT: s_mov_b32 s4, s12 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s12, s5 +; GCN-O0-NEXT: s_mov_b32 s13, s4 +; GCN-O0-NEXT: s_mov_b64 s[4:5], 32 +; GCN-O0-NEXT: s_mov_b32 s6, s8 +; GCN-O0-NEXT: s_mov_b32 s7, s9 +; GCN-O0-NEXT: s_mov_b32 s9, s4 +; GCN-O0-NEXT: s_mov_b32 s8, s5 +; GCN-O0-NEXT: s_add_u32 s6, s6, s9 +; GCN-O0-NEXT: s_addc_u32 s8, s7, s8 +; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x10 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s8, s7 +; GCN-O0-NEXT: s_mov_b32 s9, s6 +; GCN-O0-NEXT: s_load_dwordx8 s[24:31], s[0:1], 0x64 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s14, s31 +; GCN-O0-NEXT: s_mov_b32 s6, s30 +; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s7, s14 +; GCN-O0-NEXT: s_mov_b32 s14, s7 +; GCN-O0-NEXT: s_mov_b32 s15, s6 +; GCN-O0-NEXT: s_mov_b32 s16, s29 +; GCN-O0-NEXT: s_mov_b32 s6, s28 +; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s7, s16 +; GCN-O0-NEXT: s_mov_b32 s16, s7 +; GCN-O0-NEXT: s_mov_b32 s17, s6 +; GCN-O0-NEXT: s_mov_b32 s18, s27 +; GCN-O0-NEXT: s_mov_b32 s6, s26 +; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s7, s18 +; GCN-O0-NEXT: s_mov_b32 s18, s7 +; GCN-O0-NEXT: s_mov_b32 s19, s6 +; GCN-O0-NEXT: s_mov_b32 s20, s25 +; GCN-O0-NEXT: s_mov_b32 s6, s24 +; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s7, s20 +; GCN-O0-NEXT: s_mov_b32 s20, s7 +; GCN-O0-NEXT: s_mov_b32 s21, s6 +; GCN-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s6, s7 +; GCN-O0-NEXT: ; implicit-def: $sgpr22_sgpr23 +; GCN-O0-NEXT: s_mov_b32 s7, s22 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s21 +; GCN-O0-NEXT: v_mov_b32_e32 v30, s20 +; GCN-O0-NEXT: v_mov_b32_e32 v29, s19 +; GCN-O0-NEXT: v_mov_b32_e32 v28, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v27, s17 +; GCN-O0-NEXT: v_mov_b32_e32 v26, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v25, s15 +; GCN-O0-NEXT: v_mov_b32_e32 v24, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v23, s13 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s6 ; GCN-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v8, v30 ; GCN-O0-NEXT: v_mov_b32_e32 v9, v29 @@ -2248,47 +2673,89 @@ define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %v ; GCN-O0-NEXT: v_mov_b32_e32 v20, v2 ; GCN-O0-NEXT: v_mov_b32_e32 v21, v1 ; GCN-O0-NEXT: v_mov_b32_e32 v22, v0 -; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0xa4 -; GCN-O0-NEXT: s_mov_b32 s1, 1 +; GCN-O0-NEXT: s_load_dword s0, s[0:1], 0xa4 +; GCN-O0-NEXT: s_mov_b32 s1, 2 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s0, s0, s1 -; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 -; GCN-O0-NEXT: s_mov_b32 s1, s4 +; GCN-O0-NEXT: s_mul_i32 s0, s0, s1 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 1.0 +; GCN-O0-NEXT: s_mov_b32 s1, s6 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s1 ; GCN-O0-NEXT: s_mov_b32 m0, s0 ; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0 -; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s1, 1 +; GCN-O0-NEXT: s_add_i32 s0, s0, s1 +; GCN-O0-NEXT: s_mov_b32 s1, s7 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s1 ; GCN-O0-NEXT: s_mov_b32 m0, s0 -; GCN-O0-NEXT: v_movreld_b32_e32 v8, v0 +; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0 ; GCN-O0-NEXT: v_mov_b32_e32 v0, v18 ; GCN-O0-NEXT: v_mov_b32_e32 v1, v17 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v16 +; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, v2 +; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v4, v16 ; GCN-O0-NEXT: v_mov_b32_e32 v2, v15 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v10 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v9 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v8 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v3 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v14 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v13 +; GCN-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v5, v3 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v5 +; GCN-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v5, v12 +; GCN-O0-NEXT: v_mov_b32_e32 v23, v11 +; GCN-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v24, v5 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v24 +; GCN-O0-NEXT: v_mov_b32_e32 v31, v23 +; GCN-O0-NEXT: v_mov_b32_e32 v25, v10 +; GCN-O0-NEXT: v_mov_b32_e32 v23, v9 +; GCN-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v24, v25 +; GCN-O0-NEXT: v_mov_b32_e32 v32, v24 +; GCN-O0-NEXT: v_mov_b32_e32 v33, v23 +; GCN-O0-NEXT: v_mov_b32_e32 v25, v8 ; GCN-O0-NEXT: v_mov_b32_e32 v23, v7 -; GCN-O0-NEXT: v_mov_b32_e32 v24, v14 -; GCN-O0-NEXT: v_mov_b32_e32 v25, v13 -; GCN-O0-NEXT: v_mov_b32_e32 v30, v12 -; GCN-O0-NEXT: v_mov_b32_e32 v26, v11 -; GCN-O0-NEXT: ; kill: def $vgpr26 killed $vgpr26 def $vgpr26_vgpr27_vgpr28_vgpr29 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v27, v30 -; GCN-O0-NEXT: v_mov_b32_e32 v28, v25 -; GCN-O0-NEXT: v_mov_b32_e32 v29, v24 +; GCN-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v24, v25 +; GCN-O0-NEXT: v_mov_b32_e32 v34, v24 +; GCN-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 killed $vgpr23_vgpr24 killed $exec +; GCN-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v24, v34 +; GCN-O0-NEXT: v_mov_b32_e32 v25, v33 +; GCN-O0-NEXT: v_mov_b32_e32 v26, v32 +; GCN-O0-NEXT: v_mov_b32_e32 v27, v31 +; GCN-O0-NEXT: v_mov_b32_e32 v28, v5 +; GCN-O0-NEXT: v_mov_b32_e32 v29, v4 +; GCN-O0-NEXT: v_mov_b32_e32 v30, v3 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v30 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v29 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v28 +; GCN-O0-NEXT: v_mov_b32_e32 v31, v27 +; GCN-O0-NEXT: ; kill: def $vgpr31 killed $vgpr31 def $vgpr31_vgpr32_vgpr33_vgpr34 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v32, v5 +; GCN-O0-NEXT: v_mov_b32_e32 v33, v4 +; GCN-O0-NEXT: v_mov_b32_e32 v34, v3 ; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 ; GCN-O0-NEXT: s_mov_b32 s0, s2 ; GCN-O0-NEXT: s_mov_b32 s1, s3 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 +; GCN-O0-NEXT: s_mov_b32 s9, s6 +; GCN-O0-NEXT: s_mov_b32 s8, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s9 +; GCN-O0-NEXT: s_addc_u32 s8, s1, s8 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v25, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v24, s0 -; GCN-O0-NEXT: flat_store_dwordx4 v[24:25], v[26:29] +; GCN-O0-NEXT: s_mov_b32 s1, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s0 +; GCN-O0-NEXT: flat_store_dwordx4 v[3:4], v[31:34] +; GCN-O0-NEXT: v_mov_b32_e32 v3, v26 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v25 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v24 +; GCN-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 killed $vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30 killed $exec ; GCN-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24_vgpr25_vgpr26 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v24, v5 ; GCN-O0-NEXT: v_mov_b32_e32 v25, v4 @@ -2300,23 +2767,6 @@ define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %v ; GCN-O0-NEXT: v_mov_b32_e32 v7, v19 ; GCN-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v8, v3 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 48 -; GCN-O0-NEXT: s_mov_b32 s0, s2 -; GCN-O0-NEXT: s_mov_b32 s1, s3 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s0 -; GCN-O0-NEXT: flat_store_dwordx2 v[3:4], v[7:8] -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[4:5], 32 ; GCN-O0-NEXT: s_mov_b32 s0, s2 ; GCN-O0-NEXT: s_mov_b32 s1, s3 ; GCN-O0-NEXT: s_mov_b32 s3, s4 @@ -2325,6 +2775,21 @@ define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %v ; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s2 +; GCN-O0-NEXT: flat_store_dwordx2 v[3:4], v[7:8] +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 ; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -2429,8 +2894,24 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> ; ; GCN-O0-LABEL: double16_inselt: ; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xa4 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0xa4 +; GCN-O0-NEXT: s_mov_b32 s2, s4 +; GCN-O0-NEXT: s_mov_b32 s0, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s1, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s3 +; GCN-O0-NEXT: s_addc_u32 s0, s0, s1 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s0 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x40 +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x124 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xe4 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x40 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_mov_b32 s2, s51 ; GCN-O0-NEXT: s_mov_b32 s3, s50 @@ -2531,18 +3012,20 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> ; GCN-O0-NEXT: v_mov_b32_e32 v37, v1 ; GCN-O0-NEXT: v_mov_b32_e32 v38, v0 ; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x124 -; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_mov_b32 s3, 2 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 +; GCN-O0-NEXT: s_mul_i32 s2, s2, s3 ; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 ; GCN-O0-NEXT: s_mov_b32 s3, s4 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 ; GCN-O0-NEXT: s_mov_b32 m0, s2 ; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_add_i32 s2, s2, s3 ; GCN-O0-NEXT: s_mov_b32 s3, s5 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 ; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movreld_b32_e32 v8, v0 +; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0 ; GCN-O0-NEXT: v_mov_b32_e32 v0, v38 ; GCN-O0-NEXT: v_mov_b32_e32 v1, v37 ; GCN-O0-NEXT: v_mov_b32_e32 v6, v36 @@ -2551,66 +3034,6 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> ; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 ; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 ; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x70 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v34 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v33 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v32 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v31 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x60 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v30 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v29 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v28 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v27 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x50 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v26 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v25 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v24 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v23 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 ; GCN-O0-NEXT: s_mov_b64 s[6:7], 64 ; GCN-O0-NEXT: s_mov_b32 s2, s0 ; GCN-O0-NEXT: s_mov_b32 s3, s1 @@ -2620,6 +3043,65 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> ; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 ; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: s_mov_b64 s[8:9], 32 +; GCN-O0-NEXT: s_mov_b32 s4, s2 +; GCN-O0-NEXT: s_mov_b32 s5, s3 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s6, s9 +; GCN-O0-NEXT: s_add_u32 s4, s4, s7 +; GCN-O0-NEXT: s_addc_u32 s6, s5, s6 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 +; GCN-O0-NEXT: s_mov_b32 s10, s4 +; GCN-O0-NEXT: s_mov_b32 s11, s5 +; GCN-O0-NEXT: s_mov_b32 s13, s6 +; GCN-O0-NEXT: s_mov_b32 s12, s7 +; GCN-O0-NEXT: s_add_u32 s10, s10, s13 +; GCN-O0-NEXT: s_addc_u32 s12, s11, s12 +; GCN-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11 +; GCN-O0-NEXT: s_mov_b32 s11, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s11 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v34 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v33 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v32 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v31 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v30 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v29 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v28 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v27 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b32 s4, s2 +; GCN-O0-NEXT: s_mov_b32 s5, s3 +; GCN-O0-NEXT: s_mov_b32 s11, s6 +; GCN-O0-NEXT: s_mov_b32 s10, s7 +; GCN-O0-NEXT: s_add_u32 s4, s4, s11 +; GCN-O0-NEXT: s_addc_u32 s10, s5, s10 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v26 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v25 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v24 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v23 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 ; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -2631,17 +3113,24 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> ; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 ; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 ; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 48 ; GCN-O0-NEXT: s_mov_b32 s2, s0 ; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_mov_b32 s5, s8 +; GCN-O0-NEXT: s_mov_b32 s4, s9 ; GCN-O0-NEXT: s_add_u32 s2, s2, s5 ; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 ; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: s_mov_b32 s4, s2 +; GCN-O0-NEXT: s_mov_b32 s5, s3 +; GCN-O0-NEXT: s_mov_b32 s9, s6 +; GCN-O0-NEXT: s_mov_b32 s8, s7 +; GCN-O0-NEXT: s_add_u32 s4, s4, s9 +; GCN-O0-NEXT: s_addc_u32 s8, s5, s8 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 ; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GCN-O0-NEXT: v_mov_b32_e32 v0, v18 ; GCN-O0-NEXT: v_mov_b32_e32 v1, v17 @@ -2651,15 +3140,6 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> ; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 ; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 ; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 32 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 ; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -2671,7 +3151,6 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> ; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 ; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 ; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 ; GCN-O0-NEXT: s_mov_b32 s2, s0 ; GCN-O0-NEXT: s_mov_b32 s3, s1 ; GCN-O0-NEXT: s_mov_b32 s5, s6 @@ -2793,262 +3272,555 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> ; ; GCN-O0-LABEL: double15_inselt: ; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x114 +; GCN-O0-NEXT: s_mov_b32 s56, SCRATCH_RSRC_DWORD0 +; GCN-O0-NEXT: s_mov_b32 s57, SCRATCH_RSRC_DWORD1 +; GCN-O0-NEXT: s_mov_b32 s58, -1 +; GCN-O0-NEXT: s_mov_b32 s59, 0xe80000 +; GCN-O0-NEXT: s_add_u32 s56, s56, s11 +; GCN-O0-NEXT: s_addc_u32 s57, s57, 0 +; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0xa4 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s6, s1 -; GCN-O0-NEXT: s_mov_b32 s7, s0 -; GCN-O0-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x104 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0xa4 +; GCN-O0-NEXT: s_mov_b32 s4, s2 +; GCN-O0-NEXT: s_mov_b32 s0, s3 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s1, s7 +; GCN-O0-NEXT: s_add_u32 s4, s4, s5 +; GCN-O0-NEXT: s_addc_u32 s0, s0, s1 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s0 +; GCN-O0-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GCN-O0-NEXT: s_load_dword s0, s[2:3], 0x124 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s8, s15 -; GCN-O0-NEXT: s_mov_b32 s9, s14 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0xe4 +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s12, s27 -; GCN-O0-NEXT: s_mov_b32 s13, s26 -; GCN-O0-NEXT: s_mov_b32 s14, s25 -; GCN-O0-NEXT: s_mov_b32 s15, s24 -; GCN-O0-NEXT: s_mov_b32 s16, s23 -; GCN-O0-NEXT: s_mov_b32 s17, s22 -; GCN-O0-NEXT: s_mov_b32 s18, s21 -; GCN-O0-NEXT: s_mov_b32 s19, s20 -; GCN-O0-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-O0-NEXT: s_mov_b64 s[8:9], 64 +; GCN-O0-NEXT: s_mov_b32 s10, s4 +; GCN-O0-NEXT: s_mov_b32 s6, s5 +; GCN-O0-NEXT: s_mov_b32 s11, s8 +; GCN-O0-NEXT: s_mov_b32 s7, s9 +; GCN-O0-NEXT: s_add_u32 s10, s10, s11 +; GCN-O0-NEXT: s_addc_u32 s6, s6, s7 +; GCN-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11 +; GCN-O0-NEXT: s_mov_b32 s11, s6 +; GCN-O0-NEXT: s_load_dwordx4 s[16:19], s[10:11], 0x20 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s20, s51 -; GCN-O0-NEXT: s_mov_b32 s21, s50 -; GCN-O0-NEXT: s_mov_b32 s22, s49 -; GCN-O0-NEXT: s_mov_b32 s23, s48 -; GCN-O0-NEXT: s_mov_b32 s24, s47 -; GCN-O0-NEXT: s_mov_b32 s25, s46 -; GCN-O0-NEXT: s_mov_b32 s26, s45 -; GCN-O0-NEXT: s_mov_b32 s27, s44 -; GCN-O0-NEXT: s_mov_b32 s28, s43 -; GCN-O0-NEXT: s_mov_b32 s29, s42 -; GCN-O0-NEXT: s_mov_b32 s30, s41 -; GCN-O0-NEXT: s_mov_b32 s31, s40 -; GCN-O0-NEXT: s_mov_b32 s33, s39 -; GCN-O0-NEXT: s_mov_b32 s34, s38 -; GCN-O0-NEXT: s_mov_b32 s35, s37 -; GCN-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 killed $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 -; GCN-O0-NEXT: ; implicit-def: $sgpr1 -; GCN-O0-NEXT: ; implicit-def: $sgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr37 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s36 -; GCN-O0-NEXT: v_mov_b32_e32 v62, s35 -; GCN-O0-NEXT: v_mov_b32_e32 v61, s34 -; GCN-O0-NEXT: v_mov_b32_e32 v60, s33 -; GCN-O0-NEXT: v_mov_b32_e32 v59, s31 -; GCN-O0-NEXT: v_mov_b32_e32 v58, s30 -; GCN-O0-NEXT: v_mov_b32_e32 v57, s29 -; GCN-O0-NEXT: v_mov_b32_e32 v56, s28 -; GCN-O0-NEXT: v_mov_b32_e32 v55, s27 -; GCN-O0-NEXT: v_mov_b32_e32 v54, s26 -; GCN-O0-NEXT: v_mov_b32_e32 v53, s25 -; GCN-O0-NEXT: v_mov_b32_e32 v52, s24 -; GCN-O0-NEXT: v_mov_b32_e32 v51, s23 -; GCN-O0-NEXT: v_mov_b32_e32 v50, s22 -; GCN-O0-NEXT: v_mov_b32_e32 v49, s21 -; GCN-O0-NEXT: v_mov_b32_e32 v48, s20 -; GCN-O0-NEXT: v_mov_b32_e32 v47, s19 -; GCN-O0-NEXT: v_mov_b32_e32 v46, s18 -; GCN-O0-NEXT: v_mov_b32_e32 v45, s17 -; GCN-O0-NEXT: v_mov_b32_e32 v44, s16 -; GCN-O0-NEXT: v_mov_b32_e32 v43, s15 -; GCN-O0-NEXT: v_mov_b32_e32 v42, s14 -; GCN-O0-NEXT: v_mov_b32_e32 v41, s13 -; GCN-O0-NEXT: v_mov_b32_e32 v40, s12 -; GCN-O0-NEXT: v_mov_b32_e32 v39, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v8, v62 -; GCN-O0-NEXT: v_mov_b32_e32 v9, v61 -; GCN-O0-NEXT: v_mov_b32_e32 v10, v60 -; GCN-O0-NEXT: v_mov_b32_e32 v11, v59 -; GCN-O0-NEXT: v_mov_b32_e32 v12, v58 -; GCN-O0-NEXT: v_mov_b32_e32 v13, v57 -; GCN-O0-NEXT: v_mov_b32_e32 v14, v56 -; GCN-O0-NEXT: v_mov_b32_e32 v15, v55 -; GCN-O0-NEXT: v_mov_b32_e32 v16, v54 -; GCN-O0-NEXT: v_mov_b32_e32 v17, v53 -; GCN-O0-NEXT: v_mov_b32_e32 v18, v52 -; GCN-O0-NEXT: v_mov_b32_e32 v19, v51 -; GCN-O0-NEXT: v_mov_b32_e32 v20, v50 -; GCN-O0-NEXT: v_mov_b32_e32 v21, v49 -; GCN-O0-NEXT: v_mov_b32_e32 v22, v48 -; GCN-O0-NEXT: v_mov_b32_e32 v23, v47 -; GCN-O0-NEXT: v_mov_b32_e32 v24, v46 -; GCN-O0-NEXT: v_mov_b32_e32 v25, v45 -; GCN-O0-NEXT: v_mov_b32_e32 v26, v44 -; GCN-O0-NEXT: v_mov_b32_e32 v27, v43 -; GCN-O0-NEXT: v_mov_b32_e32 v28, v42 -; GCN-O0-NEXT: v_mov_b32_e32 v29, v41 -; GCN-O0-NEXT: v_mov_b32_e32 v30, v40 -; GCN-O0-NEXT: v_mov_b32_e32 v31, v39 +; GCN-O0-NEXT: s_mov_b32 s12, s19 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s7, s12 +; GCN-O0-NEXT: s_mov_b32 s12, s7 +; GCN-O0-NEXT: s_mov_b32 s13, s6 +; GCN-O0-NEXT: s_mov_b32 s14, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s16 +; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s7, s14 +; GCN-O0-NEXT: s_mov_b32 s14, s7 +; GCN-O0-NEXT: s_mov_b32 s15, s6 +; GCN-O0-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x40 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s6, s31 +; GCN-O0-NEXT: s_mov_b32 s4, s30 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s16, s5 +; GCN-O0-NEXT: s_mov_b32 s17, s4 +; GCN-O0-NEXT: s_mov_b32 s6, s29 +; GCN-O0-NEXT: s_mov_b32 s4, s28 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s18, s5 +; GCN-O0-NEXT: s_mov_b32 s19, s4 +; GCN-O0-NEXT: s_mov_b32 s6, s27 +; GCN-O0-NEXT: s_mov_b32 s4, s26 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s20, s5 +; GCN-O0-NEXT: s_mov_b32 s21, s4 +; GCN-O0-NEXT: s_mov_b32 s6, s25 +; GCN-O0-NEXT: s_mov_b32 s4, s24 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s22, s5 +; GCN-O0-NEXT: s_mov_b32 s23, s4 +; GCN-O0-NEXT: s_mov_b64 s[4:5], 32 +; GCN-O0-NEXT: s_mov_b32 s6, s10 +; GCN-O0-NEXT: s_mov_b32 s7, s11 +; GCN-O0-NEXT: s_mov_b32 s11, s4 +; GCN-O0-NEXT: s_mov_b32 s10, s5 +; GCN-O0-NEXT: s_add_u32 s6, s6, s11 +; GCN-O0-NEXT: s_addc_u32 s10, s7, s10 +; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s7, s10 +; GCN-O0-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x10 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s10, s7 +; GCN-O0-NEXT: s_mov_b32 s11, s6 +; GCN-O0-NEXT: s_load_dwordx16 s[40:55], s[2:3], 0xa4 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s24, s55 +; GCN-O0-NEXT: s_mov_b32 s6, s54 +; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s7, s24 +; GCN-O0-NEXT: s_mov_b32 s24, s7 +; GCN-O0-NEXT: s_mov_b32 s25, s6 +; GCN-O0-NEXT: s_mov_b32 s26, s53 +; GCN-O0-NEXT: s_mov_b32 s6, s52 +; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s7, s26 +; GCN-O0-NEXT: s_mov_b32 s26, s7 +; GCN-O0-NEXT: s_mov_b32 s27, s6 +; GCN-O0-NEXT: s_mov_b32 s28, s51 +; GCN-O0-NEXT: s_mov_b32 s6, s50 +; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s7, s28 +; GCN-O0-NEXT: s_mov_b32 s28, s7 +; GCN-O0-NEXT: s_mov_b32 s29, s6 +; GCN-O0-NEXT: s_mov_b32 s30, s49 +; GCN-O0-NEXT: s_mov_b32 s6, s48 +; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s7, s30 +; GCN-O0-NEXT: s_mov_b32 s30, s7 +; GCN-O0-NEXT: s_mov_b32 s31, s6 +; GCN-O0-NEXT: s_mov_b32 s33, s47 +; GCN-O0-NEXT: s_mov_b32 s6, s46 +; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s7, s33 +; GCN-O0-NEXT: s_mov_b32 s33, s7 +; GCN-O0-NEXT: s_mov_b32 s34, s6 +; GCN-O0-NEXT: s_mov_b32 s35, s45 +; GCN-O0-NEXT: s_mov_b32 s6, s44 +; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s7, s35 +; GCN-O0-NEXT: s_mov_b32 s35, s7 +; GCN-O0-NEXT: s_mov_b32 s36, s6 +; GCN-O0-NEXT: s_mov_b32 s37, s43 +; GCN-O0-NEXT: s_mov_b32 s6, s42 +; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s7, s37 +; GCN-O0-NEXT: s_mov_b32 s37, s7 +; GCN-O0-NEXT: s_mov_b32 s38, s6 +; GCN-O0-NEXT: s_mov_b32 s39, s41 +; GCN-O0-NEXT: s_mov_b32 s6, s40 +; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s7, s39 +; GCN-O0-NEXT: s_mov_b32 s39, s7 +; GCN-O0-NEXT: s_mov_b32 s40, s6 +; GCN-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s6, s7 +; GCN-O0-NEXT: ; implicit-def: $sgpr42_sgpr43 +; GCN-O0-NEXT: s_mov_b32 s7, s42 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s40 +; GCN-O0-NEXT: v_mov_b32_e32 v62, s39 +; GCN-O0-NEXT: v_mov_b32_e32 v61, s38 +; GCN-O0-NEXT: v_mov_b32_e32 v60, s37 +; GCN-O0-NEXT: v_mov_b32_e32 v59, s36 +; GCN-O0-NEXT: v_mov_b32_e32 v58, s35 +; GCN-O0-NEXT: v_mov_b32_e32 v57, s34 +; GCN-O0-NEXT: v_mov_b32_e32 v56, s33 +; GCN-O0-NEXT: v_mov_b32_e32 v55, s31 +; GCN-O0-NEXT: v_mov_b32_e32 v54, s30 +; GCN-O0-NEXT: v_mov_b32_e32 v53, s29 +; GCN-O0-NEXT: v_mov_b32_e32 v52, s28 +; GCN-O0-NEXT: v_mov_b32_e32 v51, s27 +; GCN-O0-NEXT: v_mov_b32_e32 v50, s26 +; GCN-O0-NEXT: v_mov_b32_e32 v49, s25 +; GCN-O0-NEXT: v_mov_b32_e32 v48, s24 +; GCN-O0-NEXT: v_mov_b32_e32 v47, s23 +; GCN-O0-NEXT: v_mov_b32_e32 v46, s22 +; GCN-O0-NEXT: v_mov_b32_e32 v45, s21 +; GCN-O0-NEXT: v_mov_b32_e32 v44, s20 +; GCN-O0-NEXT: v_mov_b32_e32 v43, s19 +; GCN-O0-NEXT: v_mov_b32_e32 v42, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v41, s17 +; GCN-O0-NEXT: v_mov_b32_e32 v40, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v39, s15 +; GCN-O0-NEXT: v_mov_b32_e32 v38, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v37, s13 +; GCN-O0-NEXT: v_mov_b32_e32 v36, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v35, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v34, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v33, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v32, s6 +; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v1, v62 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v61 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v60 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v59 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v58 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v57 +; GCN-O0-NEXT: v_mov_b32_e32 v7, v56 +; GCN-O0-NEXT: v_mov_b32_e32 v8, v55 +; GCN-O0-NEXT: v_mov_b32_e32 v9, v54 +; GCN-O0-NEXT: v_mov_b32_e32 v10, v53 +; GCN-O0-NEXT: v_mov_b32_e32 v11, v52 +; GCN-O0-NEXT: v_mov_b32_e32 v12, v51 +; GCN-O0-NEXT: v_mov_b32_e32 v13, v50 +; GCN-O0-NEXT: v_mov_b32_e32 v14, v49 +; GCN-O0-NEXT: v_mov_b32_e32 v15, v48 +; GCN-O0-NEXT: v_mov_b32_e32 v16, v47 +; GCN-O0-NEXT: v_mov_b32_e32 v17, v46 +; GCN-O0-NEXT: v_mov_b32_e32 v18, v45 +; GCN-O0-NEXT: v_mov_b32_e32 v19, v44 +; GCN-O0-NEXT: v_mov_b32_e32 v20, v43 +; GCN-O0-NEXT: v_mov_b32_e32 v21, v42 +; GCN-O0-NEXT: v_mov_b32_e32 v22, v41 +; GCN-O0-NEXT: v_mov_b32_e32 v23, v40 +; GCN-O0-NEXT: v_mov_b32_e32 v24, v39 +; GCN-O0-NEXT: v_mov_b32_e32 v25, v38 +; GCN-O0-NEXT: v_mov_b32_e32 v26, v37 +; GCN-O0-NEXT: v_mov_b32_e32 v27, v36 +; GCN-O0-NEXT: v_mov_b32_e32 v28, v35 +; GCN-O0-NEXT: v_mov_b32_e32 v29, v34 +; GCN-O0-NEXT: v_mov_b32_e32 v30, v33 +; GCN-O0-NEXT: v_mov_b32_e32 v31, v32 +; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x124 +; GCN-O0-NEXT: s_mov_b32 s3, 2 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mul_i32 s2, s2, s3 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 1.0 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v32, s3 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movreld_b32_e32 v0, v32 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_add_i32 s2, s2, s3 +; GCN-O0-NEXT: s_mov_b32 s3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v32, s3 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movreld_b32_e32 v0, v32 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[56:59], 0 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[56:59], 0 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[56:59], 0 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[56:59], 0 offset:40 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[56:59], 0 offset:44 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[56:59], 0 offset:48 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[56:59], 0 offset:52 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v7, off, s[56:59], 0 offset:56 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v8, off, s[56:59], 0 offset:60 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v9, off, s[56:59], 0 offset:64 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v10, off, s[56:59], 0 offset:68 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v11, off, s[56:59], 0 offset:72 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v12, off, s[56:59], 0 offset:76 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v13, off, s[56:59], 0 offset:80 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v14, off, s[56:59], 0 offset:84 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v15, off, s[56:59], 0 offset:88 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v16, off, s[56:59], 0 offset:92 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v17, off, s[56:59], 0 offset:96 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v18, off, s[56:59], 0 offset:100 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v19, off, s[56:59], 0 offset:104 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v20, off, s[56:59], 0 offset:108 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v21, off, s[56:59], 0 offset:112 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v22, off, s[56:59], 0 offset:116 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v23, off, s[56:59], 0 offset:120 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v24, off, s[56:59], 0 offset:124 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v25, off, s[56:59], 0 offset:128 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v26, off, s[56:59], 0 offset:132 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v27, off, s[56:59], 0 offset:136 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v28, off, s[56:59], 0 offset:140 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v29, off, s[56:59], 0 offset:144 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v30, off, s[56:59], 0 offset:148 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v31, off, s[56:59], 0 offset:152 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v34, v27 +; GCN-O0-NEXT: v_mov_b32_e32 v32, v26 +; GCN-O0-NEXT: ; kill: def $vgpr32 killed $vgpr32 def $vgpr32_vgpr33 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v33, v34 +; GCN-O0-NEXT: v_mov_b32_e32 v34, v33 +; GCN-O0-NEXT: buffer_store_dword v34, off, s[56:59], 0 ; 4-byte Folded Spill +; GCN-O0-NEXT: ; kill: def $vgpr32 killed $vgpr32 killed $vgpr32_vgpr33 killed $exec +; GCN-O0-NEXT: buffer_store_dword v32, off, s[56:59], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v34, v25 +; GCN-O0-NEXT: v_mov_b32_e32 v32, v24 +; GCN-O0-NEXT: ; kill: def $vgpr32 killed $vgpr32 def $vgpr32_vgpr33 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v33, v34 +; GCN-O0-NEXT: v_mov_b32_e32 v34, v33 +; GCN-O0-NEXT: buffer_store_dword v34, off, s[56:59], 0 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: ; kill: def $vgpr32 killed $vgpr32 killed $vgpr32_vgpr33 killed $exec +; GCN-O0-NEXT: buffer_store_dword v32, off, s[56:59], 0 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v34, v23 +; GCN-O0-NEXT: v_mov_b32_e32 v32, v22 +; GCN-O0-NEXT: ; kill: def $vgpr32 killed $vgpr32 def $vgpr32_vgpr33 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v33, v34 +; GCN-O0-NEXT: v_mov_b32_e32 v34, v33 +; GCN-O0-NEXT: buffer_store_dword v34, off, s[56:59], 0 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: ; kill: def $vgpr32 killed $vgpr32 killed $vgpr32_vgpr33 killed $exec +; GCN-O0-NEXT: buffer_store_dword v32, off, s[56:59], 0 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v34, v21 +; GCN-O0-NEXT: v_mov_b32_e32 v32, v20 +; GCN-O0-NEXT: ; kill: def $vgpr32 killed $vgpr32 def $vgpr32_vgpr33 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v33, v34 +; GCN-O0-NEXT: v_mov_b32_e32 v34, v33 +; GCN-O0-NEXT: buffer_store_dword v34, off, s[56:59], 0 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v47, v32 +; GCN-O0-NEXT: v_mov_b32_e32 v34, v19 +; GCN-O0-NEXT: v_mov_b32_e32 v32, v18 +; GCN-O0-NEXT: ; kill: def $vgpr32 killed $vgpr32 def $vgpr32_vgpr33 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v33, v34 +; GCN-O0-NEXT: v_mov_b32_e32 v48, v33 +; GCN-O0-NEXT: v_mov_b32_e32 v49, v32 +; GCN-O0-NEXT: v_mov_b32_e32 v34, v17 +; GCN-O0-NEXT: v_mov_b32_e32 v32, v16 +; GCN-O0-NEXT: ; kill: def $vgpr32 killed $vgpr32 def $vgpr32_vgpr33 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v33, v34 +; GCN-O0-NEXT: v_mov_b32_e32 v50, v33 +; GCN-O0-NEXT: v_mov_b32_e32 v39, v32 +; GCN-O0-NEXT: v_mov_b32_e32 v34, v15 +; GCN-O0-NEXT: v_mov_b32_e32 v32, v14 +; GCN-O0-NEXT: ; kill: def $vgpr32 killed $vgpr32 def $vgpr32_vgpr33 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v33, v34 +; GCN-O0-NEXT: v_mov_b32_e32 v34, v33 +; GCN-O0-NEXT: buffer_store_dword v34, off, s[56:59], 0 offset:160 ; 4-byte Folded Spill +; GCN-O0-NEXT: ; kill: def $vgpr32 killed $vgpr32 killed $vgpr32_vgpr33 killed $exec +; GCN-O0-NEXT: buffer_store_dword v32, off, s[56:59], 0 offset:164 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v34, v13 +; GCN-O0-NEXT: v_mov_b32_e32 v32, v12 +; GCN-O0-NEXT: ; kill: def $vgpr32 killed $vgpr32 def $vgpr32_vgpr33 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v33, v34 +; GCN-O0-NEXT: v_mov_b32_e32 v34, v33 +; GCN-O0-NEXT: buffer_store_dword v34, off, s[56:59], 0 offset:168 ; 4-byte Folded Spill +; GCN-O0-NEXT: ; kill: def $vgpr32 killed $vgpr32 killed $vgpr32_vgpr33 killed $exec +; GCN-O0-NEXT: buffer_store_dword v32, off, s[56:59], 0 offset:172 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v34, v11 +; GCN-O0-NEXT: v_mov_b32_e32 v32, v10 +; GCN-O0-NEXT: ; kill: def $vgpr32 killed $vgpr32 def $vgpr32_vgpr33 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v33, v34 +; GCN-O0-NEXT: v_mov_b32_e32 v34, v33 +; GCN-O0-NEXT: buffer_store_dword v34, off, s[56:59], 0 offset:176 ; 4-byte Folded Spill +; GCN-O0-NEXT: ; kill: def $vgpr32 killed $vgpr32 killed $vgpr32_vgpr33 killed $exec +; GCN-O0-NEXT: buffer_store_dword v32, off, s[56:59], 0 offset:180 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v34, v9 +; GCN-O0-NEXT: v_mov_b32_e32 v32, v8 +; GCN-O0-NEXT: ; kill: def $vgpr32 killed $vgpr32 def $vgpr32_vgpr33 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v33, v34 +; GCN-O0-NEXT: v_mov_b32_e32 v34, v33 +; GCN-O0-NEXT: buffer_store_dword v34, off, s[56:59], 0 offset:184 ; 4-byte Folded Spill +; GCN-O0-NEXT: ; kill: def $vgpr32 killed $vgpr32 killed $vgpr32_vgpr33 killed $exec +; GCN-O0-NEXT: buffer_store_dword v32, off, s[56:59], 0 offset:188 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v34, v7 ; GCN-O0-NEXT: v_mov_b32_e32 v32, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v33, v5 -; GCN-O0-NEXT: v_mov_b32_e32 v34, v4 -; GCN-O0-NEXT: v_mov_b32_e32 v35, v3 -; GCN-O0-NEXT: v_mov_b32_e32 v36, v2 -; GCN-O0-NEXT: v_mov_b32_e32 v37, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v38, v0 -; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x124 -; GCN-O0-NEXT: s_mov_b32 s1, 1 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s0, s0, s1 -; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 -; GCN-O0-NEXT: s_mov_b32 s1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s1 -; GCN-O0-NEXT: s_mov_b32 m0, s0 -; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s1 -; GCN-O0-NEXT: s_mov_b32 m0, s0 -; GCN-O0-NEXT: v_movreld_b32_e32 v8, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, v34 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v33 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v32 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v31 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v26 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v25 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v24 -; GCN-O0-NEXT: v_mov_b32_e32 v39, v23 -; GCN-O0-NEXT: v_mov_b32_e32 v40, v30 -; GCN-O0-NEXT: v_mov_b32_e32 v41, v29 -; GCN-O0-NEXT: v_mov_b32_e32 v46, v28 -; GCN-O0-NEXT: v_mov_b32_e32 v42, v27 -; GCN-O0-NEXT: v_mov_b32_e32 v43, v10 -; GCN-O0-NEXT: v_mov_b32_e32 v44, v9 -; GCN-O0-NEXT: v_mov_b32_e32 v45, v8 -; GCN-O0-NEXT: v_mov_b32_e32 v47, v7 -; GCN-O0-NEXT: v_mov_b32_e32 v48, v14 -; GCN-O0-NEXT: v_mov_b32_e32 v49, v13 -; GCN-O0-NEXT: v_mov_b32_e32 v54, v12 -; GCN-O0-NEXT: v_mov_b32_e32 v50, v11 -; GCN-O0-NEXT: v_mov_b32_e32 v51, v18 -; GCN-O0-NEXT: v_mov_b32_e32 v52, v17 -; GCN-O0-NEXT: v_mov_b32_e32 v53, v16 -; GCN-O0-NEXT: v_mov_b32_e32 v55, v15 -; GCN-O0-NEXT: v_mov_b32_e32 v56, v22 -; GCN-O0-NEXT: v_mov_b32_e32 v57, v21 -; GCN-O0-NEXT: v_mov_b32_e32 v62, v20 -; GCN-O0-NEXT: v_mov_b32_e32 v58, v19 -; GCN-O0-NEXT: ; kill: def $vgpr58 killed $vgpr58 def $vgpr58_vgpr59_vgpr60_vgpr61 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v59, v62 -; GCN-O0-NEXT: v_mov_b32_e32 v60, v57 -; GCN-O0-NEXT: v_mov_b32_e32 v61, v56 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 48 -; GCN-O0-NEXT: s_mov_b32 s0, s2 -; GCN-O0-NEXT: s_mov_b32 s1, s3 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v57, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v56, s0 -; GCN-O0-NEXT: flat_store_dwordx4 v[56:57], v[58:61] -; GCN-O0-NEXT: ; kill: def $vgpr55 killed $vgpr55 def $vgpr55_vgpr56_vgpr57_vgpr58 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v56, v53 -; GCN-O0-NEXT: v_mov_b32_e32 v57, v52 -; GCN-O0-NEXT: v_mov_b32_e32 v58, v51 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 32 -; GCN-O0-NEXT: s_mov_b32 s0, s2 -; GCN-O0-NEXT: s_mov_b32 s1, s3 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v52, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v51, s0 -; GCN-O0-NEXT: flat_store_dwordx4 v[51:52], v[55:58] -; GCN-O0-NEXT: ; kill: def $vgpr50 killed $vgpr50 def $vgpr50_vgpr51_vgpr52_vgpr53 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v51, v54 -; GCN-O0-NEXT: v_mov_b32_e32 v52, v49 -; GCN-O0-NEXT: v_mov_b32_e32 v53, v48 +; GCN-O0-NEXT: ; kill: def $vgpr32 killed $vgpr32 def $vgpr32_vgpr33 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v33, v34 +; GCN-O0-NEXT: v_mov_b32_e32 v34, v33 +; GCN-O0-NEXT: buffer_store_dword v34, off, s[56:59], 0 offset:192 ; 4-byte Folded Spill +; GCN-O0-NEXT: ; kill: def $vgpr32 killed $vgpr32 killed $vgpr32_vgpr33 killed $exec +; GCN-O0-NEXT: buffer_store_dword v32, off, s[56:59], 0 offset:196 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v34, v5 +; GCN-O0-NEXT: v_mov_b32_e32 v32, v4 +; GCN-O0-NEXT: ; kill: def $vgpr32 killed $vgpr32 def $vgpr32_vgpr33 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v33, v34 +; GCN-O0-NEXT: v_mov_b32_e32 v34, v33 +; GCN-O0-NEXT: buffer_store_dword v34, off, s[56:59], 0 offset:200 ; 4-byte Folded Spill +; GCN-O0-NEXT: ; kill: def $vgpr32 killed $vgpr32 killed $vgpr32_vgpr33 killed $exec +; GCN-O0-NEXT: buffer_store_dword v32, off, s[56:59], 0 offset:204 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v34, v3 +; GCN-O0-NEXT: v_mov_b32_e32 v32, v2 +; GCN-O0-NEXT: ; kill: def $vgpr32 killed $vgpr32 def $vgpr32_vgpr33 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v33, v34 +; GCN-O0-NEXT: v_mov_b32_e32 v34, v33 +; GCN-O0-NEXT: buffer_store_dword v34, off, s[56:59], 0 offset:208 ; 4-byte Folded Spill +; GCN-O0-NEXT: ; kill: def $vgpr32 killed $vgpr32 killed $vgpr32_vgpr33 killed $exec +; GCN-O0-NEXT: buffer_store_dword v32, off, s[56:59], 0 offset:212 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v32, v1 +; GCN-O0-NEXT: buffer_store_dword v32, off, s[56:59], 0 offset:156 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_dword v29, off, s[56:59], 0 offset:212 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v28, off, s[56:59], 0 offset:208 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v27, off, s[56:59], 0 offset:204 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v26, off, s[56:59], 0 offset:200 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v25, off, s[56:59], 0 offset:196 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v24, off, s[56:59], 0 offset:192 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v23, off, s[56:59], 0 offset:188 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v22, off, s[56:59], 0 offset:184 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v21, off, s[56:59], 0 offset:180 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v20, off, s[56:59], 0 offset:176 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v19, off, s[56:59], 0 offset:172 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v18, off, s[56:59], 0 offset:168 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v17, off, s[56:59], 0 offset:164 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v16, off, s[56:59], 0 offset:160 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[56:59], 0 offset:156 ; 4-byte Folded Reload +; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v1, v2 +; GCN-O0-NEXT: v_mov_b32_e32 v30, v1 +; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v1, v30 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v29 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v28 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v27 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v26 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v25 +; GCN-O0-NEXT: v_mov_b32_e32 v7, v24 +; GCN-O0-NEXT: v_mov_b32_e32 v8, v23 +; GCN-O0-NEXT: v_mov_b32_e32 v9, v22 +; GCN-O0-NEXT: v_mov_b32_e32 v10, v21 +; GCN-O0-NEXT: v_mov_b32_e32 v11, v20 +; GCN-O0-NEXT: v_mov_b32_e32 v12, v19 +; GCN-O0-NEXT: v_mov_b32_e32 v13, v18 +; GCN-O0-NEXT: v_mov_b32_e32 v14, v17 +; GCN-O0-NEXT: v_mov_b32_e32 v15, v16 +; GCN-O0-NEXT: v_mov_b32_e32 v16, v15 +; GCN-O0-NEXT: v_mov_b32_e32 v17, v14 +; GCN-O0-NEXT: v_mov_b32_e32 v22, v13 +; GCN-O0-NEXT: v_mov_b32_e32 v18, v12 +; GCN-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19_vgpr20_vgpr21 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v19, v22 +; GCN-O0-NEXT: v_mov_b32_e32 v20, v17 +; GCN-O0-NEXT: v_mov_b32_e32 v21, v16 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s7, s4 +; GCN-O0-NEXT: s_mov_b32 s6, s5 +; GCN-O0-NEXT: s_add_u32 s2, s2, s7 +; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 ; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 -; GCN-O0-NEXT: s_mov_b32 s0, s2 -; GCN-O0-NEXT: s_mov_b32 s1, s3 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v49, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v48, s0 -; GCN-O0-NEXT: flat_store_dwordx4 v[48:49], v[50:53] -; GCN-O0-NEXT: ; kill: def $vgpr47 killed $vgpr47 def $vgpr47_vgpr48_vgpr49_vgpr50 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v48, v45 -; GCN-O0-NEXT: v_mov_b32_e32 v49, v44 -; GCN-O0-NEXT: v_mov_b32_e32 v50, v43 -; GCN-O0-NEXT: v_mov_b32_e32 v44, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v43, s2 -; GCN-O0-NEXT: flat_store_dwordx4 v[43:44], v[47:50] +; GCN-O0-NEXT: s_mov_b32 s10, s2 +; GCN-O0-NEXT: s_mov_b32 s11, s3 +; GCN-O0-NEXT: s_mov_b32 s13, s6 +; GCN-O0-NEXT: s_mov_b32 s12, s7 +; GCN-O0-NEXT: s_add_u32 s10, s10, s13 +; GCN-O0-NEXT: s_addc_u32 s12, s11, s12 +; GCN-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11 +; GCN-O0-NEXT: s_mov_b32 s11, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v17, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v16, s10 +; GCN-O0-NEXT: flat_store_dwordx4 v[16:17], v[18:21] +; GCN-O0-NEXT: v_mov_b32_e32 v16, v11 +; GCN-O0-NEXT: v_mov_b32_e32 v17, v10 +; GCN-O0-NEXT: v_mov_b32_e32 v22, v9 +; GCN-O0-NEXT: v_mov_b32_e32 v18, v8 +; GCN-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19_vgpr20_vgpr21 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v19, v22 +; GCN-O0-NEXT: v_mov_b32_e32 v20, v17 +; GCN-O0-NEXT: v_mov_b32_e32 v21, v16 +; GCN-O0-NEXT: v_mov_b32_e32 v17, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v16, s2 +; GCN-O0-NEXT: flat_store_dwordx4 v[16:17], v[18:21] +; GCN-O0-NEXT: v_mov_b32_e32 v16, v7 +; GCN-O0-NEXT: v_mov_b32_e32 v17, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v22, v5 +; GCN-O0-NEXT: v_mov_b32_e32 v18, v4 +; GCN-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19_vgpr20_vgpr21 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v19, v22 +; GCN-O0-NEXT: v_mov_b32_e32 v20, v17 +; GCN-O0-NEXT: v_mov_b32_e32 v21, v16 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s11, s6 +; GCN-O0-NEXT: s_mov_b32 s10, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s11 +; GCN-O0-NEXT: s_addc_u32 s10, s3, s10 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v17, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v16, s2 +; GCN-O0-NEXT: flat_store_dwordx4 v[16:17], v[18:21] +; GCN-O0-NEXT: v_mov_b32_e32 v40, v3 +; GCN-O0-NEXT: v_mov_b32_e32 v41, v2 +; GCN-O0-NEXT: v_mov_b32_e32 v46, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v42, v0 +; GCN-O0-NEXT: buffer_load_dword v7, off, s[56:59], 0 offset:28 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v8, off, s[56:59], 0 offset:32 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v9, off, s[56:59], 0 offset:36 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v10, off, s[56:59], 0 offset:40 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v11, off, s[56:59], 0 offset:44 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v12, off, s[56:59], 0 offset:48 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v13, off, s[56:59], 0 offset:52 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v14, off, s[56:59], 0 offset:56 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v15, off, s[56:59], 0 offset:60 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v16, off, s[56:59], 0 offset:64 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v17, off, s[56:59], 0 offset:68 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v18, off, s[56:59], 0 offset:72 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v19, off, s[56:59], 0 offset:76 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v20, off, s[56:59], 0 offset:80 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v21, off, s[56:59], 0 offset:84 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v22, off, s[56:59], 0 offset:88 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v23, off, s[56:59], 0 offset:92 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v24, off, s[56:59], 0 offset:96 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v25, off, s[56:59], 0 offset:100 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v26, off, s[56:59], 0 offset:104 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v27, off, s[56:59], 0 offset:108 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v28, off, s[56:59], 0 offset:112 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v29, off, s[56:59], 0 offset:116 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v30, off, s[56:59], 0 offset:120 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v31, off, s[56:59], 0 offset:124 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v32, off, s[56:59], 0 offset:128 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v33, off, s[56:59], 0 offset:132 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v34, off, s[56:59], 0 offset:136 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v35, off, s[56:59], 0 offset:140 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v36, off, s[56:59], 0 offset:144 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v37, off, s[56:59], 0 offset:148 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v38, off, s[56:59], 0 offset:152 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[56:59], 0 offset:24 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[56:59], 0 offset:20 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[56:59], 0 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[56:59], 0 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[56:59], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[56:59], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[56:59], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: ; kill: def $vgpr42 killed $vgpr42 def $vgpr42_vgpr43_vgpr44_vgpr45 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v43, v46 ; GCN-O0-NEXT: v_mov_b32_e32 v44, v41 ; GCN-O0-NEXT: v_mov_b32_e32 v45, v40 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x50 -; GCN-O0-NEXT: s_mov_b32 s0, s2 -; GCN-O0-NEXT: s_mov_b32 s1, s3 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s4 ; GCN-O0-NEXT: v_mov_b32_e32 v41, s1 ; GCN-O0-NEXT: v_mov_b32_e32 v40, s0 ; GCN-O0-NEXT: flat_store_dwordx4 v[40:41], v[42:45] +; GCN-O0-NEXT: ; kill: def $vgpr39 killed $vgpr39 def $vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v40, v50 +; GCN-O0-NEXT: v_mov_b32_e32 v41, v49 +; GCN-O0-NEXT: v_mov_b32_e32 v42, v48 +; GCN-O0-NEXT: v_mov_b32_e32 v43, v47 +; GCN-O0-NEXT: s_waitcnt vmcnt(6) +; GCN-O0-NEXT: v_mov_b32_e32 v44, v5 +; GCN-O0-NEXT: s_waitcnt vmcnt(5) +; GCN-O0-NEXT: v_mov_b32_e32 v45, v4 +; GCN-O0-NEXT: s_waitcnt vmcnt(4) +; GCN-O0-NEXT: v_mov_b32_e32 v46, v3 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v46 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v45 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v44 +; GCN-O0-NEXT: v_mov_b32_e32 v47, v43 +; GCN-O0-NEXT: ; kill: def $vgpr47 killed $vgpr47 def $vgpr47_vgpr48_vgpr49_vgpr50 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v48, v5 +; GCN-O0-NEXT: v_mov_b32_e32 v49, v4 +; GCN-O0-NEXT: v_mov_b32_e32 v50, v3 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s0, s1 +; GCN-O0-NEXT: s_mov_b32 s3, s8 +; GCN-O0-NEXT: s_mov_b32 s1, s9 +; GCN-O0-NEXT: s_add_u32 s2, s2, s3 +; GCN-O0-NEXT: s_addc_u32 s0, s0, s1 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s0 +; GCN-O0-NEXT: s_mov_b32 s0, s2 +; GCN-O0-NEXT: s_mov_b32 s1, s3 +; GCN-O0-NEXT: s_mov_b32 s9, s6 +; GCN-O0-NEXT: s_mov_b32 s8, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s9 +; GCN-O0-NEXT: s_addc_u32 s8, s1, s8 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s0 +; GCN-O0-NEXT: flat_store_dwordx4 v[3:4], v[47:50] +; GCN-O0-NEXT: v_mov_b32_e32 v3, v42 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v41 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v40 +; GCN-O0-NEXT: ; kill: def $vgpr39 killed $vgpr39 killed $vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 killed $exec ; GCN-O0-NEXT: ; kill: def $vgpr39 killed $vgpr39 def $vgpr39_vgpr40_vgpr41_vgpr42 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v40, v5 ; GCN-O0-NEXT: v_mov_b32_e32 v41, v4 ; GCN-O0-NEXT: v_mov_b32_e32 v42, v3 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 64 -; GCN-O0-NEXT: s_mov_b32 s0, s2 -; GCN-O0-NEXT: s_mov_b32 s1, s3 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s2 ; GCN-O0-NEXT: flat_store_dwordx4 v[3:4], v[39:42] ; GCN-O0-NEXT: v_mov_b32_e32 v3, v36 ; GCN-O0-NEXT: v_mov_b32_e32 v7, v35 ; GCN-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v8, v3 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x70 -; GCN-O0-NEXT: s_mov_b32 s0, s2 -; GCN-O0-NEXT: s_mov_b32 s1, s3 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s0 -; GCN-O0-NEXT: flat_store_dwordx2 v[3:4], v[7:8] -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[4:5], 0x60 ; GCN-O0-NEXT: s_mov_b32 s0, s2 ; GCN-O0-NEXT: s_mov_b32 s1, s3 ; GCN-O0-NEXT: s_mov_b32 s3, s4 @@ -3057,6 +3829,23 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> ; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s2 +; GCN-O0-NEXT: flat_store_dwordx2 v[3:4], v[7:8] +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: s_waitcnt vmcnt(5) +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: s_waitcnt vmcnt(4) +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 ; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -3125,46 +3914,91 @@ define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 ; GCN-O0-NEXT: s_mov_b32 s15, 0xe80000 ; GCN-O0-NEXT: s_add_u32 s12, s12, s11 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-O0-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x30 +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_bfe_u32 s3, s4, 0x10001 -; GCN-O0-NEXT: s_bfe_u32 s5, s4, 0x20002 -; GCN-O0-NEXT: s_bfe_u32 s6, s4, 0x10003 -; GCN-O0-NEXT: s_mov_b32 s7, 3 -; GCN-O0-NEXT: s_and_b32 s7, s2, s7 +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x30 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 44 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x30 +; GCN-O0-NEXT: s_mov_b32 s10, 3 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_and_b32 s3, s6, s10 +; GCN-O0-NEXT: s_mov_b32 s7, 0xffff +; GCN-O0-NEXT: s_and_b32 s4, s7, s3 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_lshr_b32 s5, s4, s3 +; GCN-O0-NEXT: s_mov_b32 s4, 15 +; GCN-O0-NEXT: s_and_b32 s4, s6, s4 +; GCN-O0-NEXT: s_and_b32 s4, s7, s4 +; GCN-O0-NEXT: s_mov_b32 s9, 2 +; GCN-O0-NEXT: s_lshr_b32 s4, s4, s9 +; GCN-O0-NEXT: s_mov_b32 s8, s4 +; GCN-O0-NEXT: s_and_b32 s8, s8, s10 +; GCN-O0-NEXT: s_and_b32 s7, s7, s8 +; GCN-O0-NEXT: s_lshr_b32 s8, s7, s3 +; GCN-O0-NEXT: s_and_b32 s6, 1, s6 +; GCN-O0-NEXT: s_cmp_eq_u32 s6, 1 +; GCN-O0-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN-O0-NEXT: s_and_b32 s2, s2, s10 +; GCN-O0-NEXT: s_mul_i32 s3, s2, s3 ; GCN-O0-NEXT: s_mov_b32 s2, 0 -; GCN-O0-NEXT: s_or_b32 s2, s2, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s6 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s5 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:2 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:1 -; GCN-O0-NEXT: v_mov_b32_e32 v3, 1 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: buffer_store_byte v3, v0, s[12:15], 0 offen -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 -; GCN-O0-NEXT: buffer_load_ubyte v4, off, s[12:15], 0 offset:1 -; GCN-O0-NEXT: buffer_load_ubyte v2, off, s[12:15], 0 offset:2 -; GCN-O0-NEXT: buffer_load_ubyte v1, off, s[12:15], 0 offset:3 -; GCN-O0-NEXT: s_waitcnt vmcnt(3) -; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 -; GCN-O0-NEXT: s_waitcnt vmcnt(2) -; GCN-O0-NEXT: v_and_b32_e64 v4, v4, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v4, v3, v4 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v4 -; GCN-O0-NEXT: s_waitcnt vmcnt(1) -; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 -; GCN-O0-NEXT: s_mov_b32 s2, 2 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s2, v2 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 -; GCN-O0-NEXT: s_mov_b32 s2, 3 +; GCN-O0-NEXT: s_add_i32 s3, s2, s3 +; GCN-O0-NEXT: s_add_i32 s2, s2, s9 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s2 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[12:15], 0 offen offset:1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s5 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[12:15], 0 offen +; GCN-O0-NEXT: v_mov_b32_e32 v1, s2 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[12:15], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, 1 +; GCN-O0-NEXT: buffer_load_ubyte v3, off, s[12:15], 0 +; GCN-O0-NEXT: buffer_load_ubyte v2, off, s[12:15], 0 offset:1 +; GCN-O0-NEXT: buffer_load_ubyte v1, off, s[12:15], 0 offset:2 +; GCN-O0-NEXT: s_waitcnt vmcnt(2) +; GCN-O0-NEXT: v_and_b32_e64 v3, 1, v3 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, 1 +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: v_and_b32_e64 v2, 1, v2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, 1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v1, 1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[8:9] +; GCN-O0-NEXT: s_mov_b32 s8, 2 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s8, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[6:7] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, v0, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; GCN-O0-NEXT: s_mov_b32 s2, 3 ; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 ; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 ; GCN-O0-NEXT: s_mov_b32 s2, 15 @@ -4018,13 +4852,79 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; ; GCN-O0-LABEL: bit128_inselt: ; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GCN-O0-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GCN-O0-NEXT: s_mov_b32 s18, -1 -; GCN-O0-NEXT: s_mov_b32 s19, 0xe80000 -; GCN-O0-NEXT: s_add_u32 s16, s16, s11 -; GCN-O0-NEXT: s_addc_u32 s17, s17, 0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 52 +; GCN-O0-NEXT: s_mov_b32 s96, SCRATCH_RSRC_DWORD0 +; GCN-O0-NEXT: s_mov_b32 s97, SCRATCH_RSRC_DWORD1 +; GCN-O0-NEXT: s_mov_b32 s98, -1 +; GCN-O0-NEXT: s_mov_b32 s99, 0xe80000 +; GCN-O0-NEXT: s_add_u32 s96, s96, s11 +; GCN-O0-NEXT: s_addc_u32 s97, s97, 0 +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0xb0 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0xac +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0xa8 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0xa4 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0xa0 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x9c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x98 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x94 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x90 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x8c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x88 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x84 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x80 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x7c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x78 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x74 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x70 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x6c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x68 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x64 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x34 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x38 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x3c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x40 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x44 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x48 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x4c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x50 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x54 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x58 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x5c +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x60 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0xb0 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_mov_b32 s0, s4 ; GCN-O0-NEXT: s_mov_b32 s1, s5 ; GCN-O0-NEXT: s_mov_b32 s3, s6 @@ -4036,1572 +4936,3059 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 ; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] -; GCN-O0-NEXT: s_mov_b32 s1, 1 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0xac +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:388 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:648 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v2, v0, 2, 1 -; GCN-O0-NEXT: v_bfe_u32 v3, v0, 3, 1 -; GCN-O0-NEXT: v_bfe_u32 v4, v0, 4, 1 -; GCN-O0-NEXT: v_bfe_u32 v5, v0, 5, 1 -; GCN-O0-NEXT: v_bfe_u32 v6, v0, 6, 1 -; GCN-O0-NEXT: s_mov_b32 s0, 7 -; GCN-O0-NEXT: v_lshrrev_b32_e64 v7, s0, v0 -; GCN-O0-NEXT: s_mov_b64 s[8:9], 53 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0xa8 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0xa4 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0xa0 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x9c +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x98 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x94 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x90 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x8c +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x88 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x84 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x80 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x7c +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x78 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x74 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x70 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x6c +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x68 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x64 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x60 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x5c +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x58 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x54 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x50 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x4c +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x48 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x44 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 64 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 60 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 56 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 52 ; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 +; GCN-O0-NEXT: s_mov_b32 s0, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s1, s7 +; GCN-O0-NEXT: s_add_u32 s14, s2, s3 +; GCN-O0-NEXT: s_addc_u32 s0, s0, s1 +; GCN-O0-NEXT: ; kill: def $sgpr14 killed $sgpr14 def $sgpr14_sgpr15 +; GCN-O0-NEXT: s_mov_b32 s15, s0 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s15 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 8 +; GCN-O0-NEXT: ; implicit-def: $vgpr12 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v12, s6, 0 +; GCN-O0-NEXT: v_writelane_b32 v12, s7, 1 +; GCN-O0-NEXT: s_mov_b32 s2, s14 +; GCN-O0-NEXT: s_mov_b32 s0, s15 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s1, s7 +; GCN-O0-NEXT: s_add_u32 s78, s2, s3 +; GCN-O0-NEXT: s_addc_u32 s0, s0, s1 +; GCN-O0-NEXT: ; kill: def $sgpr78 killed $sgpr78 def $sgpr78_sgpr79 +; GCN-O0-NEXT: s_mov_b32 s79, s0 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, s78 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s79 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s17, v0 +; GCN-O0-NEXT: s_mov_b32 s53, 1 +; GCN-O0-NEXT: s_and_b32 s7, s17, s53 +; GCN-O0-NEXT: s_mov_b64 s[18:19], 4 +; GCN-O0-NEXT: v_writelane_b32 v12, s18, 2 +; GCN-O0-NEXT: v_writelane_b32 v12, s19, 3 +; GCN-O0-NEXT: s_mov_b32 s0, s14 +; GCN-O0-NEXT: s_mov_b32 s1, s15 +; GCN-O0-NEXT: s_mov_b32 s3, s18 +; GCN-O0-NEXT: s_mov_b32 s2, s19 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s44, v0 +; GCN-O0-NEXT: s_and_b32 s10, s44, s53 +; GCN-O0-NEXT: s_mov_b64 s[64:65], 2 +; GCN-O0-NEXT: s_mov_b32 s2, s14 +; GCN-O0-NEXT: s_mov_b32 s3, s15 +; GCN-O0-NEXT: s_mov_b32 s8, s64 +; GCN-O0-NEXT: s_mov_b32 s6, s65 +; GCN-O0-NEXT: s_add_u32 s2, s2, s8 ; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 ; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: v_writelane_b32 v12, s2, 4 +; GCN-O0-NEXT: v_writelane_b32 v12, s3, 5 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 ; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v8, v0, s1 -; GCN-O0-NEXT: v_bfe_u32 v9, v0, 1, 1 -; GCN-O0-NEXT: v_bfe_u32 v10, v0, 2, 1 -; GCN-O0-NEXT: v_bfe_u32 v11, v0, 3, 1 -; GCN-O0-NEXT: v_bfe_u32 v12, v0, 4, 1 -; GCN-O0-NEXT: v_bfe_u32 v13, v0, 5, 1 -; GCN-O0-NEXT: v_bfe_u32 v14, v0, 6, 1 -; GCN-O0-NEXT: v_lshrrev_b32_e64 v15, s0, v0 -; GCN-O0-NEXT: s_mov_b64 s[8:9], 54 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_readfirstlane_b32 s9, v0 +; GCN-O0-NEXT: s_and_b32 s11, s9, s53 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s15 ; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v16, v0, s1 -; GCN-O0-NEXT: v_bfe_u32 v17, v0, 1, 1 -; GCN-O0-NEXT: v_bfe_u32 v18, v0, 2, 1 -; GCN-O0-NEXT: v_bfe_u32 v19, v0, 3, 1 -; GCN-O0-NEXT: v_bfe_u32 v20, v0, 4, 1 -; GCN-O0-NEXT: v_bfe_u32 v21, v0, 5, 1 -; GCN-O0-NEXT: v_bfe_u32 v22, v0, 6, 1 -; GCN-O0-NEXT: v_lshrrev_b32_e64 v23, s0, v0 -; GCN-O0-NEXT: s_mov_b64 s[8:9], 55 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-O0-NEXT: s_mov_b32 s34, 0xff +; GCN-O0-NEXT: s_and_b32 s3, s2, s34 +; GCN-O0-NEXT: s_mov_b32 s16, 0xffff +; GCN-O0-NEXT: v_writelane_b32 v12, s16, 6 +; GCN-O0-NEXT: s_and_b32 s36, s16, s3 +; GCN-O0-NEXT: s_lshr_b32 s3, s36, s53 +; GCN-O0-NEXT: s_and_b32 s3, s3, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s3, 7 +; GCN-O0-NEXT: s_and_b32 s25, s2, s53 +; GCN-O0-NEXT: s_mov_b32 s12, 2 +; GCN-O0-NEXT: s_lshr_b32 s2, s36, s12 +; GCN-O0-NEXT: s_and_b32 s27, s2, s53 +; GCN-O0-NEXT: s_mov_b32 s2, 4 +; GCN-O0-NEXT: s_lshr_b32 s3, s36, s2 +; GCN-O0-NEXT: s_and_b32 s30, s3, s53 +; GCN-O0-NEXT: s_mov_b64 s[28:29], 1 +; GCN-O0-NEXT: s_mov_b32 s8, s14 +; GCN-O0-NEXT: s_mov_b32 s3, s15 +; GCN-O0-NEXT: s_mov_b32 s13, s28 +; GCN-O0-NEXT: s_mov_b32 s6, s29 +; GCN-O0-NEXT: s_add_u32 s14, s8, s13 +; GCN-O0-NEXT: s_addc_u32 s3, s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr14 killed $sgpr14 def $sgpr14_sgpr15 +; GCN-O0-NEXT: s_mov_b32 s15, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s15 ; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v24, v0, s1 -; GCN-O0-NEXT: v_bfe_u32 v25, v0, 1, 1 -; GCN-O0-NEXT: v_bfe_u32 v26, v0, 2, 1 -; GCN-O0-NEXT: v_bfe_u32 v27, v0, 3, 1 -; GCN-O0-NEXT: v_bfe_u32 v28, v0, 4, 1 -; GCN-O0-NEXT: v_bfe_u32 v29, v0, 5, 1 -; GCN-O0-NEXT: v_bfe_u32 v30, v0, 6, 1 -; GCN-O0-NEXT: v_lshrrev_b32_e64 v31, s0, v0 -; GCN-O0-NEXT: s_mov_b64 s[8:9], 56 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-O0-NEXT: s_and_b32 s31, s6, s53 +; GCN-O0-NEXT: s_mov_b32 s13, s78 +; GCN-O0-NEXT: s_mov_b32 s3, s79 +; GCN-O0-NEXT: s_mov_b32 s14, s18 +; GCN-O0-NEXT: s_mov_b32 s8, s19 +; GCN-O0-NEXT: s_add_u32 s18, s13, s14 +; GCN-O0-NEXT: s_addc_u32 s3, s3, s8 +; GCN-O0-NEXT: ; kill: def $sgpr18 killed $sgpr18 def $sgpr18_sgpr19 +; GCN-O0-NEXT: s_mov_b32 s19, s3 +; GCN-O0-NEXT: s_mov_b32 s13, s18 +; GCN-O0-NEXT: s_mov_b32 s3, s19 +; GCN-O0-NEXT: s_mov_b32 s14, s64 +; GCN-O0-NEXT: s_mov_b32 s8, s65 +; GCN-O0-NEXT: s_add_u32 s14, s13, s14 +; GCN-O0-NEXT: s_addc_u32 s3, s3, s8 +; GCN-O0-NEXT: ; kill: def $sgpr14 killed $sgpr14 def $sgpr14_sgpr15 +; GCN-O0-NEXT: s_mov_b32 s15, s3 +; GCN-O0-NEXT: s_mov_b32 s13, s14 +; GCN-O0-NEXT: s_mov_b32 s3, s15 +; GCN-O0-NEXT: s_mov_b32 s20, s28 +; GCN-O0-NEXT: s_mov_b32 s8, s29 +; GCN-O0-NEXT: s_add_u32 s20, s13, s20 +; GCN-O0-NEXT: s_addc_u32 s3, s3, s8 +; GCN-O0-NEXT: ; kill: def $sgpr20 killed $sgpr20 def $sgpr20_sgpr21 +; GCN-O0-NEXT: s_mov_b32 s21, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s20 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s21 ; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v32, v0, s1 -; GCN-O0-NEXT: v_bfe_u32 v33, v0, 1, 1 -; GCN-O0-NEXT: v_bfe_u32 v34, v0, 2, 1 -; GCN-O0-NEXT: v_bfe_u32 v35, v0, 3, 1 -; GCN-O0-NEXT: v_bfe_u32 v36, v0, 4, 1 -; GCN-O0-NEXT: v_bfe_u32 v37, v0, 5, 1 -; GCN-O0-NEXT: v_bfe_u32 v38, v0, 6, 1 -; GCN-O0-NEXT: v_lshrrev_b32_e64 v39, s0, v0 -; GCN-O0-NEXT: s_mov_b64 s[8:9], 57 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_readfirstlane_b32 s8, v0 +; GCN-O0-NEXT: s_and_b32 s3, s8, s34 +; GCN-O0-NEXT: s_and_b32 s3, s16, s3 +; GCN-O0-NEXT: s_lshr_b32 s13, s3, s53 +; GCN-O0-NEXT: s_and_b32 s37, s13, s53 +; GCN-O0-NEXT: s_and_b32 s41, s8, s53 +; GCN-O0-NEXT: s_lshr_b32 s8, s3, s12 +; GCN-O0-NEXT: s_and_b32 s42, s8, s53 +; GCN-O0-NEXT: s_mov_b32 s26, 3 +; GCN-O0-NEXT: s_lshr_b32 s8, s3, s26 +; GCN-O0-NEXT: s_and_b32 s43, s8, s53 +; GCN-O0-NEXT: s_lshr_b32 s8, s3, s2 +; GCN-O0-NEXT: s_and_b32 s47, s8, s53 +; GCN-O0-NEXT: s_mov_b32 s24, 5 +; GCN-O0-NEXT: s_lshr_b32 s8, s3, s24 +; GCN-O0-NEXT: s_and_b32 s49, s8, s53 +; GCN-O0-NEXT: s_mov_b32 s8, 6 +; GCN-O0-NEXT: s_lshr_b32 s13, s3, s8 +; GCN-O0-NEXT: s_and_b32 s50, s13, s53 +; GCN-O0-NEXT: s_mov_b32 s21, 7 +; GCN-O0-NEXT: v_writelane_b32 v12, s21, 8 +; GCN-O0-NEXT: s_lshr_b32 s3, s3, s21 +; GCN-O0-NEXT: s_and_b32 s51, s3, s53 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s15 ; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v40, v0, s1 -; GCN-O0-NEXT: v_bfe_u32 v41, v0, 1, 1 -; GCN-O0-NEXT: v_bfe_u32 v42, v0, 2, 1 -; GCN-O0-NEXT: v_bfe_u32 v43, v0, 3, 1 -; GCN-O0-NEXT: v_bfe_u32 v44, v0, 4, 1 -; GCN-O0-NEXT: v_bfe_u32 v45, v0, 5, 1 -; GCN-O0-NEXT: v_bfe_u32 v46, v0, 6, 1 -; GCN-O0-NEXT: v_lshrrev_b32_e64 v47, s0, v0 -; GCN-O0-NEXT: s_mov_b64 s[8:9], 58 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_readfirstlane_b32 s3, v0 +; GCN-O0-NEXT: s_and_b32 s13, s3, s34 +; GCN-O0-NEXT: s_and_b32 s13, s16, s13 +; GCN-O0-NEXT: s_lshr_b32 s14, s13, s53 +; GCN-O0-NEXT: s_and_b32 s35, s14, s53 +; GCN-O0-NEXT: s_and_b32 s54, s3, s53 +; GCN-O0-NEXT: s_lshr_b32 s3, s13, s12 +; GCN-O0-NEXT: s_and_b32 s55, s3, s53 +; GCN-O0-NEXT: s_lshr_b32 s3, s13, s26 +; GCN-O0-NEXT: s_and_b32 s58, s3, s53 +; GCN-O0-NEXT: s_lshr_b32 s3, s13, s2 +; GCN-O0-NEXT: s_and_b32 s3, s3, s53 +; GCN-O0-NEXT: s_lshr_b32 s14, s13, s24 +; GCN-O0-NEXT: s_and_b32 s59, s14, s53 +; GCN-O0-NEXT: s_lshr_b32 s14, s13, s8 +; GCN-O0-NEXT: s_and_b32 s60, s14, s53 +; GCN-O0-NEXT: s_lshr_b32 s13, s13, s21 +; GCN-O0-NEXT: s_and_b32 s61, s13, s53 +; GCN-O0-NEXT: s_mov_b32 s14, s18 +; GCN-O0-NEXT: s_mov_b32 s13, s19 +; GCN-O0-NEXT: s_mov_b32 s20, s28 +; GCN-O0-NEXT: s_mov_b32 s15, s29 +; GCN-O0-NEXT: s_add_u32 s14, s14, s20 +; GCN-O0-NEXT: s_addc_u32 s13, s13, s15 +; GCN-O0-NEXT: ; kill: def $sgpr14 killed $sgpr14 def $sgpr14_sgpr15 +; GCN-O0-NEXT: s_mov_b32 s15, s13 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s15 ; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v48, v0, s1 -; GCN-O0-NEXT: v_bfe_u32 v49, v0, 1, 1 -; GCN-O0-NEXT: v_bfe_u32 v50, v0, 2, 1 -; GCN-O0-NEXT: v_bfe_u32 v51, v0, 3, 1 -; GCN-O0-NEXT: v_bfe_u32 v52, v0, 4, 1 -; GCN-O0-NEXT: v_bfe_u32 v53, v0, 5, 1 -; GCN-O0-NEXT: v_bfe_u32 v54, v0, 6, 1 -; GCN-O0-NEXT: v_lshrrev_b32_e64 v55, s0, v0 -; GCN-O0-NEXT: s_mov_b64 s[8:9], 59 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_readfirstlane_b32 s15, v0 +; GCN-O0-NEXT: s_and_b32 s13, s15, s34 +; GCN-O0-NEXT: s_and_b32 s14, s16, s13 +; GCN-O0-NEXT: s_lshr_b32 s13, s14, s53 +; GCN-O0-NEXT: s_and_b32 s13, s13, s53 +; GCN-O0-NEXT: s_and_b32 s40, s15, s53 +; GCN-O0-NEXT: s_lshr_b32 s15, s14, s12 +; GCN-O0-NEXT: s_and_b32 s63, s15, s53 +; GCN-O0-NEXT: s_lshr_b32 s15, s14, s26 +; GCN-O0-NEXT: s_and_b32 s66, s15, s53 +; GCN-O0-NEXT: s_lshr_b32 s15, s14, s2 +; GCN-O0-NEXT: s_and_b32 s15, s15, s53 +; GCN-O0-NEXT: s_lshr_b32 s20, s14, s24 +; GCN-O0-NEXT: s_and_b32 s67, s20, s53 +; GCN-O0-NEXT: s_lshr_b32 s20, s14, s8 +; GCN-O0-NEXT: s_and_b32 s69, s20, s53 +; GCN-O0-NEXT: s_lshr_b32 s14, s14, s21 +; GCN-O0-NEXT: s_and_b32 s71, s14, s53 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s19 ; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v56, v0, s1 -; GCN-O0-NEXT: v_bfe_u32 v57, v0, 1, 1 -; GCN-O0-NEXT: v_bfe_u32 v58, v0, 2, 1 -; GCN-O0-NEXT: v_bfe_u32 v59, v0, 3, 1 -; GCN-O0-NEXT: v_bfe_u32 v60, v0, 4, 1 -; GCN-O0-NEXT: v_bfe_u32 v61, v0, 5, 1 -; GCN-O0-NEXT: v_bfe_u32 v62, v0, 6, 1 -; GCN-O0-NEXT: v_lshrrev_b32_e64 v63, s0, v0 -; GCN-O0-NEXT: s_mov_b64 s[8:9], 60 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_readfirstlane_b32 s19, v0 +; GCN-O0-NEXT: s_and_b32 s14, s19, s34 +; GCN-O0-NEXT: s_and_b32 s18, s16, s14 +; GCN-O0-NEXT: s_lshr_b32 s14, s18, s53 +; GCN-O0-NEXT: s_and_b32 s14, s14, s53 +; GCN-O0-NEXT: s_and_b32 s19, s19, s53 +; GCN-O0-NEXT: s_lshr_b32 s20, s18, s12 +; GCN-O0-NEXT: s_and_b32 s72, s20, s53 +; GCN-O0-NEXT: s_lshr_b32 s20, s18, s26 +; GCN-O0-NEXT: s_and_b32 s73, s20, s53 +; GCN-O0-NEXT: s_lshr_b32 s20, s18, s2 +; GCN-O0-NEXT: s_and_b32 s20, s20, s53 +; GCN-O0-NEXT: s_lshr_b32 s22, s18, s24 +; GCN-O0-NEXT: s_and_b32 s74, s22, s53 +; GCN-O0-NEXT: s_lshr_b32 s22, s18, s8 +; GCN-O0-NEXT: s_and_b32 s75, s22, s53 +; GCN-O0-NEXT: s_lshr_b32 s18, s18, s21 +; GCN-O0-NEXT: s_and_b32 s76, s18, s53 +; GCN-O0-NEXT: s_mov_b32 s23, s78 +; GCN-O0-NEXT: s_mov_b32 s18, s79 +; GCN-O0-NEXT: s_mov_b32 s33, s64 +; GCN-O0-NEXT: s_mov_b32 s22, s65 +; GCN-O0-NEXT: s_add_u32 s38, s23, s33 +; GCN-O0-NEXT: s_addc_u32 s18, s18, s22 +; GCN-O0-NEXT: ; kill: def $sgpr38 killed $sgpr38 def $sgpr38_sgpr39 +; GCN-O0-NEXT: s_mov_b32 s39, s18 +; GCN-O0-NEXT: s_mov_b32 s22, s38 +; GCN-O0-NEXT: s_mov_b32 s18, s39 +; GCN-O0-NEXT: s_mov_b32 s33, s28 +; GCN-O0-NEXT: s_mov_b32 s23, s29 +; GCN-O0-NEXT: s_add_u32 s22, s22, s33 +; GCN-O0-NEXT: s_addc_u32 s18, s18, s23 +; GCN-O0-NEXT: ; kill: def $sgpr22 killed $sgpr22 def $sgpr22_sgpr23 +; GCN-O0-NEXT: s_mov_b32 s23, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s22 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s23 ; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:392 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:396 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:400 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:404 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:408 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:412 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:416 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:420 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[8:9], 61 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_readfirstlane_b32 s23, v0 +; GCN-O0-NEXT: s_and_b32 s18, s23, s34 +; GCN-O0-NEXT: s_and_b32 s18, s16, s18 +; GCN-O0-NEXT: s_lshr_b32 s22, s18, s53 +; GCN-O0-NEXT: s_and_b32 s22, s22, s53 +; GCN-O0-NEXT: s_and_b32 s23, s23, s53 +; GCN-O0-NEXT: s_lshr_b32 s33, s18, s12 +; GCN-O0-NEXT: s_and_b32 s77, s33, s53 +; GCN-O0-NEXT: s_lshr_b32 s33, s18, s26 +; GCN-O0-NEXT: s_and_b32 s81, s33, s53 +; GCN-O0-NEXT: s_lshr_b32 s33, s18, s2 +; GCN-O0-NEXT: s_and_b32 s33, s33, s53 +; GCN-O0-NEXT: s_lshr_b32 s45, s18, s24 +; GCN-O0-NEXT: s_and_b32 s83, s45, s53 +; GCN-O0-NEXT: s_lshr_b32 s45, s18, s8 +; GCN-O0-NEXT: s_and_b32 s84, s45, s53 +; GCN-O0-NEXT: s_lshr_b32 s18, s18, s21 +; GCN-O0-NEXT: s_and_b32 s85, s18, s53 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s38 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s39 ; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:424 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:428 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:432 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:436 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:440 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:444 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:448 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:452 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[8:9], 62 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_readfirstlane_b32 s39, v0 +; GCN-O0-NEXT: s_and_b32 s18, s39, s34 +; GCN-O0-NEXT: s_and_b32 s18, s16, s18 +; GCN-O0-NEXT: s_lshr_b32 s38, s18, s53 +; GCN-O0-NEXT: s_and_b32 s38, s38, s53 +; GCN-O0-NEXT: s_and_b32 s39, s39, s53 +; GCN-O0-NEXT: s_lshr_b32 s45, s18, s12 +; GCN-O0-NEXT: s_and_b32 s56, s45, s53 +; GCN-O0-NEXT: s_lshr_b32 s45, s18, s26 +; GCN-O0-NEXT: s_and_b32 s87, s45, s53 +; GCN-O0-NEXT: s_lshr_b32 s45, s18, s2 +; GCN-O0-NEXT: s_and_b32 s57, s45, s53 +; GCN-O0-NEXT: s_lshr_b32 s45, s18, s24 +; GCN-O0-NEXT: s_and_b32 s89, s45, s53 +; GCN-O0-NEXT: s_lshr_b32 s45, s18, s8 +; GCN-O0-NEXT: s_and_b32 s91, s45, s53 +; GCN-O0-NEXT: s_lshr_b32 s18, s18, s21 +; GCN-O0-NEXT: s_and_b32 s93, s18, s53 +; GCN-O0-NEXT: s_mov_b32 s46, s78 +; GCN-O0-NEXT: s_mov_b32 s18, s79 +; GCN-O0-NEXT: s_mov_b32 s48, s28 +; GCN-O0-NEXT: s_mov_b32 s45, s29 +; GCN-O0-NEXT: s_add_u32 s78, s46, s48 +; GCN-O0-NEXT: s_addc_u32 s18, s18, s45 +; GCN-O0-NEXT: ; kill: def $sgpr78 killed $sgpr78 def $sgpr78_sgpr79 +; GCN-O0-NEXT: s_mov_b32 s79, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s78 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s79 ; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:456 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:460 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:464 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:468 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:472 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:476 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:480 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:484 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[8:9], 63 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_readfirstlane_b32 s18, v0 +; GCN-O0-NEXT: s_and_b32 s45, s18, s34 +; GCN-O0-NEXT: s_and_b32 s45, s16, s45 +; GCN-O0-NEXT: s_lshr_b32 s46, s45, s53 +; GCN-O0-NEXT: s_and_b32 s90, s46, s53 +; GCN-O0-NEXT: s_and_b32 s92, s18, s53 +; GCN-O0-NEXT: s_lshr_b32 s18, s45, s12 +; GCN-O0-NEXT: s_and_b32 s94, s18, s53 +; GCN-O0-NEXT: s_lshr_b32 s18, s45, s26 +; GCN-O0-NEXT: s_and_b32 vcc_lo, s18, s53 +; GCN-O0-NEXT: s_lshr_b32 s18, s45, s2 +; GCN-O0-NEXT: s_and_b32 s95, s18, s53 +; GCN-O0-NEXT: s_lshr_b32 s18, s45, s24 +; GCN-O0-NEXT: s_and_b32 s18, s18, s53 +; GCN-O0-NEXT: s_lshr_b32 s46, s45, s8 +; GCN-O0-NEXT: s_and_b32 s88, s46, s53 +; GCN-O0-NEXT: s_lshr_b32 s45, s45, s21 +; GCN-O0-NEXT: s_and_b32 vcc_hi, s45, s53 +; GCN-O0-NEXT: s_and_b32 s17, s17, s34 +; GCN-O0-NEXT: s_and_b32 s16, s16, s17 +; GCN-O0-NEXT: s_lshr_b32 s17, s16, s53 +; GCN-O0-NEXT: s_and_b32 s17, s17, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s17, 9 +; GCN-O0-NEXT: s_lshr_b32 s17, s16, s12 +; GCN-O0-NEXT: s_and_b32 s17, s17, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s17, 10 +; GCN-O0-NEXT: s_lshr_b32 s17, s16, s26 +; GCN-O0-NEXT: s_and_b32 s17, s17, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s17, 11 +; GCN-O0-NEXT: s_lshr_b32 s17, s16, s2 +; GCN-O0-NEXT: s_and_b32 s17, s17, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s17, 12 +; GCN-O0-NEXT: s_lshr_b32 s17, s16, s24 +; GCN-O0-NEXT: s_and_b32 s17, s17, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s17, 13 +; GCN-O0-NEXT: s_lshr_b32 s17, s16, s8 +; GCN-O0-NEXT: s_and_b32 s17, s17, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s17, 14 +; GCN-O0-NEXT: s_lshr_b32 s16, s16, s21 +; GCN-O0-NEXT: s_and_b32 s16, s16, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s16, 15 +; GCN-O0-NEXT: s_mov_b32 s45, s0 +; GCN-O0-NEXT: s_mov_b32 s16, s1 +; GCN-O0-NEXT: s_mov_b32 s46, s64 +; GCN-O0-NEXT: s_mov_b32 s17, s65 +; GCN-O0-NEXT: s_add_u32 s78, s45, s46 +; GCN-O0-NEXT: s_addc_u32 s16, s16, s17 +; GCN-O0-NEXT: ; kill: def $sgpr78 killed $sgpr78 def $sgpr78_sgpr79 +; GCN-O0-NEXT: s_mov_b32 s79, s16 +; GCN-O0-NEXT: s_mov_b32 s16, s78 +; GCN-O0-NEXT: s_mov_b32 s17, s79 +; GCN-O0-NEXT: s_mov_b32 s46, s28 +; GCN-O0-NEXT: s_mov_b32 s45, s29 +; GCN-O0-NEXT: s_add_u32 s16, s16, s46 +; GCN-O0-NEXT: s_addc_u32 s45, s17, s45 +; GCN-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17 +; GCN-O0-NEXT: s_mov_b32 s17, s45 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s17 +; GCN-O0-NEXT: v_readlane_b32 s16, v12, 6 ; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:488 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:492 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:496 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:500 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:504 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:508 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:512 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:516 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[8:9], 64 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_readfirstlane_b32 s45, v0 +; GCN-O0-NEXT: s_and_b32 s17, s45, s34 +; GCN-O0-NEXT: s_and_b32 s17, s16, s17 +; GCN-O0-NEXT: s_lshr_b32 s46, s17, s53 +; GCN-O0-NEXT: s_and_b32 s46, s46, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s46, 16 +; GCN-O0-NEXT: s_and_b32 s45, s45, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s45, 17 +; GCN-O0-NEXT: s_lshr_b32 s45, s17, s12 +; GCN-O0-NEXT: s_and_b32 s45, s45, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s45, 18 +; GCN-O0-NEXT: s_lshr_b32 s45, s17, s26 +; GCN-O0-NEXT: s_and_b32 s45, s45, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s45, 19 +; GCN-O0-NEXT: s_lshr_b32 s45, s17, s2 +; GCN-O0-NEXT: s_and_b32 s45, s45, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s45, 20 +; GCN-O0-NEXT: s_lshr_b32 s45, s17, s24 +; GCN-O0-NEXT: s_and_b32 s45, s45, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s45, 21 +; GCN-O0-NEXT: s_lshr_b32 s45, s17, s8 +; GCN-O0-NEXT: s_and_b32 s45, s45, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s45, 22 +; GCN-O0-NEXT: s_lshr_b32 s17, s17, s21 +; GCN-O0-NEXT: s_and_b32 s17, s17, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s17, 23 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s78 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s79 ; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:520 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:524 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:528 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:532 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:536 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:540 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:544 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:548 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[8:9], 0x41 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_readfirstlane_b32 s17, v0 +; GCN-O0-NEXT: s_and_b32 s45, s17, s34 +; GCN-O0-NEXT: s_and_b32 s16, s16, s45 +; GCN-O0-NEXT: s_lshr_b32 s45, s16, s53 +; GCN-O0-NEXT: s_and_b32 s45, s45, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s45, 24 +; GCN-O0-NEXT: s_and_b32 s17, s17, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s17, 25 +; GCN-O0-NEXT: s_lshr_b32 s17, s16, s12 +; GCN-O0-NEXT: s_and_b32 s17, s17, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s17, 26 +; GCN-O0-NEXT: s_lshr_b32 s17, s16, s26 +; GCN-O0-NEXT: s_and_b32 s17, s17, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s17, 27 +; GCN-O0-NEXT: s_lshr_b32 s17, s16, s2 +; GCN-O0-NEXT: s_and_b32 s78, s17, s53 +; GCN-O0-NEXT: s_lshr_b32 s17, s16, s24 +; GCN-O0-NEXT: s_and_b32 s17, s17, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s17, 28 +; GCN-O0-NEXT: s_lshr_b32 s17, s16, s8 +; GCN-O0-NEXT: s_and_b32 s17, s17, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s17, 29 +; GCN-O0-NEXT: s_lshr_b32 s16, s16, s21 +; GCN-O0-NEXT: s_and_b32 s16, s16, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s16, 30 +; GCN-O0-NEXT: s_mov_b32 s16, s0 +; GCN-O0-NEXT: s_mov_b32 s17, s1 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 4 +; GCN-O0-NEXT: v_readlane_b32 s1, v12, 5 +; GCN-O0-NEXT: s_mov_b32 s46, s28 +; GCN-O0-NEXT: s_mov_b32 s45, s29 +; GCN-O0-NEXT: s_add_u32 s16, s16, s46 +; GCN-O0-NEXT: s_addc_u32 s45, s17, s45 +; GCN-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17 +; GCN-O0-NEXT: s_mov_b32 s17, s45 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s17 +; GCN-O0-NEXT: v_readlane_b32 s17, v12, 7 +; GCN-O0-NEXT: v_readlane_b32 s16, v12, 6 ; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:552 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:556 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:560 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:564 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:568 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:572 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:576 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:580 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[8:9], 0x42 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_readfirstlane_b32 s46, v0 +; GCN-O0-NEXT: s_and_b32 s45, s46, s34 +; GCN-O0-NEXT: s_and_b32 s45, s16, s45 +; GCN-O0-NEXT: s_lshr_b32 s48, s45, s53 +; GCN-O0-NEXT: s_and_b32 s48, s48, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s48, 31 +; GCN-O0-NEXT: s_and_b32 s46, s46, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s46, 32 +; GCN-O0-NEXT: s_lshr_b32 s46, s45, s12 +; GCN-O0-NEXT: s_and_b32 s46, s46, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s46, 33 +; GCN-O0-NEXT: s_lshr_b32 s46, s45, s26 +; GCN-O0-NEXT: s_and_b32 s46, s46, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s46, 34 +; GCN-O0-NEXT: s_lshr_b32 s46, s45, s2 +; GCN-O0-NEXT: s_and_b32 s46, s46, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s46, 35 +; GCN-O0-NEXT: s_lshr_b32 s46, s45, s24 +; GCN-O0-NEXT: s_and_b32 s46, s46, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s46, 36 +; GCN-O0-NEXT: s_lshr_b32 s46, s45, s8 +; GCN-O0-NEXT: s_and_b32 s46, s46, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s46, 37 +; GCN-O0-NEXT: s_lshr_b32 s45, s45, s21 +; GCN-O0-NEXT: s_and_b32 s45, s45, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s45, 38 +; GCN-O0-NEXT: s_and_b32 s44, s44, s34 +; GCN-O0-NEXT: s_and_b32 s16, s16, s44 +; GCN-O0-NEXT: s_lshr_b32 s44, s16, s53 +; GCN-O0-NEXT: s_and_b32 s44, s44, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s44, 39 +; GCN-O0-NEXT: s_lshr_b32 s44, s16, s12 +; GCN-O0-NEXT: s_and_b32 s44, s44, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s44, 40 +; GCN-O0-NEXT: s_lshr_b32 s44, s16, s26 +; GCN-O0-NEXT: s_and_b32 s44, s44, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s44, 41 +; GCN-O0-NEXT: s_lshr_b32 s44, s16, s2 +; GCN-O0-NEXT: s_and_b32 s79, s44, s53 +; GCN-O0-NEXT: s_lshr_b32 s44, s16, s24 +; GCN-O0-NEXT: s_and_b32 s44, s44, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s44, 42 +; GCN-O0-NEXT: s_lshr_b32 s44, s16, s8 +; GCN-O0-NEXT: s_and_b32 s44, s44, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s44, 43 +; GCN-O0-NEXT: s_lshr_b32 s16, s16, s21 +; GCN-O0-NEXT: s_and_b32 s16, s16, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s16, 44 +; GCN-O0-NEXT: s_mov_b32 s44, s0 +; GCN-O0-NEXT: s_mov_b32 s16, s1 +; GCN-O0-NEXT: v_readlane_b32 s1, v12, 8 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 6 +; GCN-O0-NEXT: s_mov_b32 s45, s28 +; GCN-O0-NEXT: s_mov_b32 s21, s29 +; GCN-O0-NEXT: s_add_u32 s44, s44, s45 +; GCN-O0-NEXT: s_addc_u32 s16, s16, s21 +; GCN-O0-NEXT: ; kill: def $sgpr44 killed $sgpr44 def $sgpr44_sgpr45 +; GCN-O0-NEXT: s_mov_b32 s45, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s44 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s45 ; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:584 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:588 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:592 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:596 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:600 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:604 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:608 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:612 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[8:9], 0x43 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:648 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(1) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_and_b32_e64 v0, v0, s1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:616 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_bfe_u32 v0, v0, 1, 1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:620 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_bfe_u32 v0, v0, 2, 1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:624 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_bfe_u32 v0, v0, 3, 1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:628 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_bfe_u32 v0, v0, 4, 1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:632 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_bfe_u32 v0, v0, 5, 1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:636 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_bfe_u32 v0, v0, 6, 1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:640 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0 +; GCN-O0-NEXT: v_readfirstlane_b32 s44, v0 +; GCN-O0-NEXT: s_and_b32 s16, s44, s34 +; GCN-O0-NEXT: s_and_b32 s16, s0, s16 +; GCN-O0-NEXT: s_lshr_b32 s21, s16, s53 +; GCN-O0-NEXT: s_and_b32 s21, s21, s53 +; GCN-O0-NEXT: s_and_b32 s82, s44, s53 +; GCN-O0-NEXT: s_lshr_b32 s44, s16, s12 +; GCN-O0-NEXT: s_and_b32 s86, s44, s53 +; GCN-O0-NEXT: s_lshr_b32 s44, s16, s26 +; GCN-O0-NEXT: s_and_b32 s80, s44, s53 +; GCN-O0-NEXT: s_lshr_b32 s44, s16, s2 +; GCN-O0-NEXT: s_and_b32 s70, s44, s53 +; GCN-O0-NEXT: s_lshr_b32 s44, s16, s24 +; GCN-O0-NEXT: s_and_b32 s44, s44, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s44, 45 +; GCN-O0-NEXT: s_lshr_b32 s44, s16, s8 +; GCN-O0-NEXT: s_and_b32 s44, s44, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s44, 46 +; GCN-O0-NEXT: s_lshr_b32 s16, s16, s1 +; GCN-O0-NEXT: s_and_b32 s16, s16, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s16, 47 +; GCN-O0-NEXT: s_and_b32 s9, s9, s34 +; GCN-O0-NEXT: s_and_b32 s9, s0, s9 +; GCN-O0-NEXT: s_lshr_b32 s16, s9, s53 +; GCN-O0-NEXT: s_and_b32 s44, s16, s53 +; GCN-O0-NEXT: s_lshr_b32 s16, s9, s12 +; GCN-O0-NEXT: s_and_b32 s62, s16, s53 +; GCN-O0-NEXT: s_lshr_b32 s16, s9, s26 +; GCN-O0-NEXT: s_and_b32 s68, s16, s53 +; GCN-O0-NEXT: s_lshr_b32 s16, s9, s2 +; GCN-O0-NEXT: s_and_b32 s45, s16, s53 +; GCN-O0-NEXT: s_lshr_b32 s16, s9, s24 +; GCN-O0-NEXT: s_and_b32 s16, s16, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s16, 48 +; GCN-O0-NEXT: s_lshr_b32 s16, s9, s8 +; GCN-O0-NEXT: s_and_b32 s16, s16, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s16, 49 +; GCN-O0-NEXT: s_lshr_b32 s9, s9, s1 +; GCN-O0-NEXT: s_and_b32 s9, s9, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s9, 50 +; GCN-O0-NEXT: s_lshr_b32 s9, s36, s26 +; GCN-O0-NEXT: s_and_b32 s9, s9, s53 +; GCN-O0-NEXT: s_lshr_b32 s16, s36, s24 +; GCN-O0-NEXT: s_and_b32 s16, s16, s53 +; GCN-O0-NEXT: s_lshr_b32 s46, s36, s8 +; GCN-O0-NEXT: s_and_b32 s48, s46, s53 +; GCN-O0-NEXT: s_lshr_b32 s36, s36, s1 +; GCN-O0-NEXT: s_and_b32 s52, s36, s53 +; GCN-O0-NEXT: s_and_b32 s6, s6, s34 +; GCN-O0-NEXT: s_and_b32 s0, s0, s6 +; GCN-O0-NEXT: s_lshr_b32 s6, s0, s53 +; GCN-O0-NEXT: s_and_b32 s6, s6, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s6, 51 +; GCN-O0-NEXT: s_lshr_b32 s6, s0, s12 +; GCN-O0-NEXT: s_and_b32 s6, s6, s53 +; GCN-O0-NEXT: s_lshr_b32 s26, s0, s26 +; GCN-O0-NEXT: s_and_b32 s46, s26, s53 +; GCN-O0-NEXT: s_lshr_b32 s26, s0, s2 +; GCN-O0-NEXT: s_and_b32 s36, s26, s53 +; GCN-O0-NEXT: s_lshr_b32 s24, s0, s24 +; GCN-O0-NEXT: s_and_b32 s24, s24, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s24, 52 +; GCN-O0-NEXT: s_lshr_b32 s8, s0, s8 +; GCN-O0-NEXT: s_and_b32 s8, s8, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s8, 53 +; GCN-O0-NEXT: s_lshr_b32 s0, s0, s1 +; GCN-O0-NEXT: s_and_b32 s0, s0, s53 +; GCN-O0-NEXT: v_writelane_b32 v12, s0, 54 ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x44 -; GCN-O0-NEXT: s_mov_b32 s3, 0x7f ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_and_b32 s3, s2, s3 -; GCN-O0-NEXT: s_mov_b32 s2, 0 -; GCN-O0-NEXT: s_add_i32 s2, s2, s3 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:127 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:640 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:126 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:636 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:125 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:632 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:124 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:628 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:123 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:624 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:122 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:620 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:121 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:616 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:120 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:612 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:119 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:608 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:118 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:604 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:117 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:600 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:116 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:596 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:115 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:592 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:114 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:588 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:113 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:584 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:112 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:580 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:111 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:576 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:110 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:572 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:109 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:568 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:108 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:564 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:107 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:560 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:106 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:556 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:105 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:552 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:104 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:548 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:103 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:544 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:102 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:540 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:101 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:536 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:100 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:532 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:99 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:528 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:98 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:524 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:97 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:520 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:96 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:516 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:95 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:512 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:94 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:508 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:93 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:504 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:92 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:500 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:91 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:496 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:90 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:492 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:89 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:488 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:88 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:484 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:87 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:480 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:86 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:476 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:85 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:472 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:84 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:468 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:83 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:464 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:82 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:460 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:81 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:456 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:80 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:452 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:79 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:448 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:78 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:444 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:77 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:440 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:76 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:436 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:75 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:432 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:74 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:428 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:73 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:424 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:72 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:420 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:71 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:416 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:70 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:412 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:69 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:408 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:68 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:404 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:67 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:400 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:66 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:396 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:65 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:392 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:64 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:388 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_store_byte v63, off, s[16:19], 0 offset:63 -; GCN-O0-NEXT: buffer_store_byte v62, off, s[16:19], 0 offset:62 -; GCN-O0-NEXT: buffer_store_byte v61, off, s[16:19], 0 offset:61 -; GCN-O0-NEXT: buffer_store_byte v60, off, s[16:19], 0 offset:60 -; GCN-O0-NEXT: buffer_store_byte v59, off, s[16:19], 0 offset:59 -; GCN-O0-NEXT: buffer_store_byte v58, off, s[16:19], 0 offset:58 -; GCN-O0-NEXT: buffer_store_byte v57, off, s[16:19], 0 offset:57 -; GCN-O0-NEXT: buffer_store_byte v56, off, s[16:19], 0 offset:56 -; GCN-O0-NEXT: buffer_store_byte v55, off, s[16:19], 0 offset:55 -; GCN-O0-NEXT: buffer_store_byte v54, off, s[16:19], 0 offset:54 -; GCN-O0-NEXT: buffer_store_byte v53, off, s[16:19], 0 offset:53 -; GCN-O0-NEXT: buffer_store_byte v52, off, s[16:19], 0 offset:52 -; GCN-O0-NEXT: buffer_store_byte v51, off, s[16:19], 0 offset:51 -; GCN-O0-NEXT: buffer_store_byte v50, off, s[16:19], 0 offset:50 -; GCN-O0-NEXT: buffer_store_byte v49, off, s[16:19], 0 offset:49 -; GCN-O0-NEXT: buffer_store_byte v48, off, s[16:19], 0 offset:48 -; GCN-O0-NEXT: buffer_store_byte v47, off, s[16:19], 0 offset:47 -; GCN-O0-NEXT: buffer_store_byte v46, off, s[16:19], 0 offset:46 -; GCN-O0-NEXT: buffer_store_byte v45, off, s[16:19], 0 offset:45 -; GCN-O0-NEXT: buffer_store_byte v44, off, s[16:19], 0 offset:44 -; GCN-O0-NEXT: buffer_store_byte v43, off, s[16:19], 0 offset:43 -; GCN-O0-NEXT: buffer_store_byte v42, off, s[16:19], 0 offset:42 -; GCN-O0-NEXT: buffer_store_byte v41, off, s[16:19], 0 offset:41 -; GCN-O0-NEXT: buffer_store_byte v40, off, s[16:19], 0 offset:40 -; GCN-O0-NEXT: buffer_store_byte v39, off, s[16:19], 0 offset:39 -; GCN-O0-NEXT: buffer_store_byte v38, off, s[16:19], 0 offset:38 -; GCN-O0-NEXT: buffer_store_byte v37, off, s[16:19], 0 offset:37 -; GCN-O0-NEXT: buffer_store_byte v36, off, s[16:19], 0 offset:36 -; GCN-O0-NEXT: buffer_store_byte v35, off, s[16:19], 0 offset:35 -; GCN-O0-NEXT: buffer_store_byte v34, off, s[16:19], 0 offset:34 -; GCN-O0-NEXT: buffer_store_byte v33, off, s[16:19], 0 offset:33 -; GCN-O0-NEXT: buffer_store_byte v32, off, s[16:19], 0 offset:32 -; GCN-O0-NEXT: buffer_store_byte v31, off, s[16:19], 0 offset:31 -; GCN-O0-NEXT: buffer_store_byte v30, off, s[16:19], 0 offset:30 -; GCN-O0-NEXT: buffer_store_byte v29, off, s[16:19], 0 offset:29 -; GCN-O0-NEXT: buffer_store_byte v28, off, s[16:19], 0 offset:28 -; GCN-O0-NEXT: buffer_store_byte v27, off, s[16:19], 0 offset:27 -; GCN-O0-NEXT: buffer_store_byte v26, off, s[16:19], 0 offset:26 -; GCN-O0-NEXT: buffer_store_byte v25, off, s[16:19], 0 offset:25 -; GCN-O0-NEXT: buffer_store_byte v24, off, s[16:19], 0 offset:24 -; GCN-O0-NEXT: buffer_store_byte v23, off, s[16:19], 0 offset:23 -; GCN-O0-NEXT: buffer_store_byte v22, off, s[16:19], 0 offset:22 -; GCN-O0-NEXT: buffer_store_byte v21, off, s[16:19], 0 offset:21 -; GCN-O0-NEXT: buffer_store_byte v20, off, s[16:19], 0 offset:20 -; GCN-O0-NEXT: buffer_store_byte v19, off, s[16:19], 0 offset:19 -; GCN-O0-NEXT: buffer_store_byte v18, off, s[16:19], 0 offset:18 -; GCN-O0-NEXT: buffer_store_byte v17, off, s[16:19], 0 offset:17 -; GCN-O0-NEXT: buffer_store_byte v16, off, s[16:19], 0 offset:16 -; GCN-O0-NEXT: buffer_store_byte v15, off, s[16:19], 0 offset:15 -; GCN-O0-NEXT: buffer_store_byte v14, off, s[16:19], 0 offset:14 -; GCN-O0-NEXT: buffer_store_byte v13, off, s[16:19], 0 offset:13 -; GCN-O0-NEXT: buffer_store_byte v12, off, s[16:19], 0 offset:12 -; GCN-O0-NEXT: buffer_store_byte v11, off, s[16:19], 0 offset:11 -; GCN-O0-NEXT: buffer_store_byte v10, off, s[16:19], 0 offset:10 -; GCN-O0-NEXT: buffer_store_byte v9, off, s[16:19], 0 offset:9 -; GCN-O0-NEXT: buffer_store_byte v8, off, s[16:19], 0 offset:8 -; GCN-O0-NEXT: buffer_store_byte v7, off, s[16:19], 0 offset:7 -; GCN-O0-NEXT: buffer_store_byte v6, off, s[16:19], 0 offset:6 -; GCN-O0-NEXT: buffer_store_byte v5, off, s[16:19], 0 offset:5 -; GCN-O0-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4 -; GCN-O0-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3 -; GCN-O0-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1 -; GCN-O0-NEXT: s_waitcnt vmcnt(14) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 -; GCN-O0-NEXT: v_mov_b32_e32 v3, 1 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: buffer_store_byte v3, v0, s[16:19], 0 offen -; GCN-O0-NEXT: buffer_load_ubyte v18, off, s[16:19], 0 offset:23 -; GCN-O0-NEXT: buffer_load_ubyte v19, off, s[16:19], 0 offset:22 -; GCN-O0-NEXT: buffer_load_ubyte v20, off, s[16:19], 0 offset:21 -; GCN-O0-NEXT: buffer_load_ubyte v21, off, s[16:19], 0 offset:20 -; GCN-O0-NEXT: buffer_load_ubyte v22, off, s[16:19], 0 offset:19 -; GCN-O0-NEXT: buffer_load_ubyte v23, off, s[16:19], 0 offset:18 -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:128 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v8, off, s[16:19], 0 offset:1 -; GCN-O0-NEXT: buffer_load_ubyte v7, off, s[16:19], 0 offset:2 -; GCN-O0-NEXT: buffer_load_ubyte v6, off, s[16:19], 0 offset:3 -; GCN-O0-NEXT: buffer_load_ubyte v5, off, s[16:19], 0 offset:4 -; GCN-O0-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:5 -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:6 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:140 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:7 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:136 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v9, off, s[16:19], 0 offset:8 -; GCN-O0-NEXT: buffer_load_ubyte v16, off, s[16:19], 0 offset:9 -; GCN-O0-NEXT: buffer_load_ubyte v15, off, s[16:19], 0 offset:10 -; GCN-O0-NEXT: buffer_load_ubyte v14, off, s[16:19], 0 offset:11 -; GCN-O0-NEXT: buffer_load_ubyte v13, off, s[16:19], 0 offset:12 -; GCN-O0-NEXT: buffer_load_ubyte v12, off, s[16:19], 0 offset:13 -; GCN-O0-NEXT: buffer_load_ubyte v11, off, s[16:19], 0 offset:14 -; GCN-O0-NEXT: buffer_load_ubyte v10, off, s[16:19], 0 offset:15 -; GCN-O0-NEXT: buffer_load_ubyte v17, off, s[16:19], 0 offset:16 -; GCN-O0-NEXT: buffer_load_ubyte v24, off, s[16:19], 0 offset:17 -; GCN-O0-NEXT: buffer_load_ubyte v26, off, s[16:19], 0 offset:31 -; GCN-O0-NEXT: buffer_load_ubyte v27, off, s[16:19], 0 offset:30 -; GCN-O0-NEXT: buffer_load_ubyte v28, off, s[16:19], 0 offset:29 -; GCN-O0-NEXT: buffer_load_ubyte v29, off, s[16:19], 0 offset:28 -; GCN-O0-NEXT: buffer_load_ubyte v30, off, s[16:19], 0 offset:27 -; GCN-O0-NEXT: buffer_load_ubyte v31, off, s[16:19], 0 offset:26 -; GCN-O0-NEXT: buffer_load_ubyte v32, off, s[16:19], 0 offset:25 -; GCN-O0-NEXT: buffer_load_ubyte v25, off, s[16:19], 0 offset:24 -; GCN-O0-NEXT: buffer_load_ubyte v34, off, s[16:19], 0 offset:39 -; GCN-O0-NEXT: buffer_load_ubyte v35, off, s[16:19], 0 offset:38 -; GCN-O0-NEXT: buffer_load_ubyte v36, off, s[16:19], 0 offset:37 -; GCN-O0-NEXT: buffer_load_ubyte v37, off, s[16:19], 0 offset:36 -; GCN-O0-NEXT: buffer_load_ubyte v38, off, s[16:19], 0 offset:35 -; GCN-O0-NEXT: buffer_load_ubyte v39, off, s[16:19], 0 offset:34 -; GCN-O0-NEXT: buffer_load_ubyte v40, off, s[16:19], 0 offset:33 -; GCN-O0-NEXT: buffer_load_ubyte v33, off, s[16:19], 0 offset:32 -; GCN-O0-NEXT: buffer_load_ubyte v42, off, s[16:19], 0 offset:47 -; GCN-O0-NEXT: buffer_load_ubyte v43, off, s[16:19], 0 offset:46 -; GCN-O0-NEXT: buffer_load_ubyte v44, off, s[16:19], 0 offset:45 -; GCN-O0-NEXT: buffer_load_ubyte v45, off, s[16:19], 0 offset:44 -; GCN-O0-NEXT: buffer_load_ubyte v46, off, s[16:19], 0 offset:43 -; GCN-O0-NEXT: buffer_load_ubyte v47, off, s[16:19], 0 offset:42 -; GCN-O0-NEXT: buffer_load_ubyte v48, off, s[16:19], 0 offset:41 -; GCN-O0-NEXT: buffer_load_ubyte v41, off, s[16:19], 0 offset:40 -; GCN-O0-NEXT: buffer_load_ubyte v50, off, s[16:19], 0 offset:55 -; GCN-O0-NEXT: buffer_load_ubyte v51, off, s[16:19], 0 offset:54 -; GCN-O0-NEXT: buffer_load_ubyte v52, off, s[16:19], 0 offset:53 -; GCN-O0-NEXT: buffer_load_ubyte v53, off, s[16:19], 0 offset:52 -; GCN-O0-NEXT: buffer_load_ubyte v54, off, s[16:19], 0 offset:51 -; GCN-O0-NEXT: buffer_load_ubyte v55, off, s[16:19], 0 offset:50 -; GCN-O0-NEXT: buffer_load_ubyte v56, off, s[16:19], 0 offset:49 -; GCN-O0-NEXT: buffer_load_ubyte v49, off, s[16:19], 0 offset:48 -; GCN-O0-NEXT: buffer_load_ubyte v58, off, s[16:19], 0 offset:63 -; GCN-O0-NEXT: buffer_load_ubyte v59, off, s[16:19], 0 offset:62 -; GCN-O0-NEXT: buffer_load_ubyte v60, off, s[16:19], 0 offset:61 -; GCN-O0-NEXT: buffer_load_ubyte v61, off, s[16:19], 0 offset:60 -; GCN-O0-NEXT: buffer_load_ubyte v62, off, s[16:19], 0 offset:59 -; GCN-O0-NEXT: buffer_load_ubyte v63, off, s[16:19], 0 offset:58 -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:57 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:132 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v57, off, s[16:19], 0 offset:56 -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:71 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:144 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:70 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:172 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:69 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:148 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:68 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:152 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:67 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:156 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:66 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:160 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:65 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:168 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:64 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:164 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:79 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:176 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:78 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:204 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:77 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:180 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:76 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:184 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:75 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:188 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:74 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:192 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:73 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:200 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:72 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:196 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:87 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:208 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:86 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:236 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:85 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:212 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:84 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:216 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:83 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:220 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:82 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:224 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:81 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:232 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:80 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:228 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:95 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:240 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:94 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:268 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:93 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:244 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:92 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:248 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:91 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:252 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:90 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:256 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:89 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:264 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:88 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:260 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:103 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:272 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:102 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:300 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:101 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:276 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:100 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:280 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:99 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:284 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:98 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:288 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:97 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:296 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:96 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:292 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:111 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:304 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:110 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:332 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:109 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:308 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:108 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:312 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:107 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:316 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:106 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:320 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:105 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:328 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:104 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:324 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:119 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:336 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:118 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:364 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:117 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:340 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:116 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:344 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:115 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:348 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:114 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:352 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:113 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:360 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:112 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:356 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:127 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:368 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:126 -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:125 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:372 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:124 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:376 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:123 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:380 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:122 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:384 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:121 -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:120 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:384 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: s_mov_b32 s7, 2 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:380 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: s_mov_b32 s6, 3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:376 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: s_mov_b32 s5, 4 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:372 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: s_mov_b32 s4, 5 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:368 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 +; GCN-O0-NEXT: v_writelane_b32 v12, s0, 55 +; GCN-O0-NEXT: v_writelane_b32 v12, s1, 56 +; GCN-O0-NEXT: v_readlane_b32 s1, v12, 16 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 54 +; GCN-O0-NEXT: s_load_dword s5, s[4:5], 0x44 +; GCN-O0-NEXT: v_readlane_b32 s4, v12, 51 +; GCN-O0-NEXT: s_mov_b32 s8, 0x7f +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_and_b32 s5, s5, s8 +; GCN-O0-NEXT: s_mul_i32 s5, s5, s53 +; GCN-O0-NEXT: s_mov_b32 s24, 0 +; GCN-O0-NEXT: v_writelane_b32 v12, s24, 57 +; GCN-O0-NEXT: s_add_i32 s5, s24, s5 +; GCN-O0-NEXT: s_mov_b32 s8, 8 +; GCN-O0-NEXT: s_add_i32 s34, s24, s8 +; GCN-O0-NEXT: s_add_i32 s26, s34, s2 +; GCN-O0-NEXT: s_add_i32 s24, s26, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 53 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s24 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 52 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s26 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 50 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s26 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s36 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s34 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s36, s34, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s46 +; GCN-O0-NEXT: v_readlane_b32 s46, v12, 57 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s36 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s34 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s34 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_add_i32 s6, s46, s2 +; GCN-O0-NEXT: s_add_i32 s4, s6, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s52 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s48 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s6 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s6 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_add_i32 s16, s46, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s16 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_mov_b32 s9, 16 +; GCN-O0-NEXT: s_add_i32 s52, s46, s9 +; GCN-O0-NEXT: s_add_i32 s48, s52, s2 +; GCN-O0-NEXT: s_add_i32 s46, s48, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 49 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s46 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 48 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s48 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 47 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s48 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s45 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s52 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s45, s52, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s68 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s45 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s62 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s52 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s44 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s52 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_add_i32 s68, s52, s8 +; GCN-O0-NEXT: s_add_i32 s62, s68, s2 +; GCN-O0-NEXT: s_add_i32 s44, s62, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 46 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s44 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 45 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s62 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 44 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s62 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s70 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s68 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s70, s68, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s80 +; GCN-O0-NEXT: v_readlane_b32 s80, v12, 57 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s70 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s86 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s68 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s82 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s52 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:8 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s21 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s68 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_mov_b32 s21, 32 +; GCN-O0-NEXT: s_add_i32 s86, s80, s21 +; GCN-O0-NEXT: s_add_i32 s82, s86, s2 +; GCN-O0-NEXT: s_add_i32 s80, s82, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 43 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s80 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 42 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s82 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 41 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s82 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s79 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s86 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s79, s86, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 40 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s79 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 39 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s86 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s86 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_add_i32 s0, s86, s8 +; GCN-O0-NEXT: v_writelane_b32 v12, s0, 58 +; GCN-O0-NEXT: s_add_i32 s0, s0, s2 +; GCN-O0-NEXT: v_writelane_b32 v12, s0, 59 +; GCN-O0-NEXT: s_add_i32 s0, s0, s12 +; GCN-O0-NEXT: v_writelane_b32 v12, s0, 60 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 38 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 60 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 37 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 59 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 36 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 59 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 35 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 58 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s0, s0, s12 +; GCN-O0-NEXT: v_writelane_b32 v12, s0, 61 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 34 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 61 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 33 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 58 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 32 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 31 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s86 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:8 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 58 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_add_i32 s0, s86, s9 +; GCN-O0-NEXT: v_writelane_b32 v12, s0, 62 +; GCN-O0-NEXT: s_add_i32 s0, s0, s2 +; GCN-O0-NEXT: v_writelane_b32 v12, s0, 63 +; GCN-O0-NEXT: s_add_i32 s0, s0, s12 +; GCN-O0-NEXT: ; implicit-def: $vgpr11 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 30 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 29 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 63 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 28 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 63 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 27 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s78 +; GCN-O0-NEXT: v_readlane_b32 s78, v12, 62 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s78 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s78, s78, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 26 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s78 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 62 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 25 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 24 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s86 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:16 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 62 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_add_i32 s0, s0, s8 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 1 +; GCN-O0-NEXT: s_add_i32 s0, s0, s2 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 2 +; GCN-O0-NEXT: s_add_i32 s0, s0, s12 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 3 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 23 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 3 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 22 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 21 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 20 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s0, s0, s12 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 4 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 19 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 18 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 17 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 62 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 57 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:8 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s1 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_mov_b32 s1, 64 +; GCN-O0-NEXT: s_add_i32 s0, s0, s1 +; GCN-O0-NEXT: v_readlane_b32 s1, v12, 60 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 5 +; GCN-O0-NEXT: s_add_i32 s0, s0, s2 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 6 +; GCN-O0-NEXT: s_add_i32 s0, s0, s12 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 7 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 15 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 7 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 14 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 6 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 13 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 6 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 12 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 5 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s0, s0, s12 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 8 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 11 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 8 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 10 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 5 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 9 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 5 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_add_i32 s0, s0, s8 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 9 +; GCN-O0-NEXT: s_add_i32 s0, s0, s2 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 10 +; GCN-O0-NEXT: s_add_i32 s0, s0, s12 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 11 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 59 +; GCN-O0-NEXT: v_mov_b32_e32 v0, vcc_hi +; GCN-O0-NEXT: v_readlane_b32 vcc_hi, v11, 11 +; GCN-O0-NEXT: v_mov_b32_e32 v1, vcc_hi +; GCN-O0-NEXT: v_readlane_b32 vcc_hi, v11, 10 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s88 +; GCN-O0-NEXT: v_readlane_b32 s88, v11, 9 +; GCN-O0-NEXT: v_mov_b32_e32 v1, vcc_hi +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s18 +; GCN-O0-NEXT: v_readlane_b32 s18, v11, 5 +; GCN-O0-NEXT: v_mov_b32_e32 v1, vcc_hi +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s95 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s88 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s95, s88, s12 +; GCN-O0-NEXT: v_writelane_b32 v11, s95, 12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, vcc_lo +; GCN-O0-NEXT: v_mov_b32_e32 v1, s95 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s94 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s88 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s92 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s18 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:8 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s90 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s88 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_add_i32 s90, s18, s9 +; GCN-O0-NEXT: s_add_i32 s88, s90, s2 +; GCN-O0-NEXT: v_writelane_b32 v11, s88, 13 +; GCN-O0-NEXT: s_add_i32 s92, s88, s12 +; GCN-O0-NEXT: v_writelane_b32 v11, s92, 14 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s93 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s92 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s91 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s88 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s89 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s88 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s57 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s90 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s57, s90, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s87 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s57 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s56 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s90 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s39 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s18 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:16 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s38 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s90 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_add_i32 s38, s90, s8 +; GCN-O0-NEXT: s_add_i32 s39, s38, s2 +; GCN-O0-NEXT: s_add_i32 s56, s39, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s85 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s56 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s84 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s39 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s83 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s39 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s33 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s38 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s33, s38, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s81 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s33 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s77 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s38 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s23 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s90 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:8 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s22 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s38 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_add_i32 s21, s18, s21 +; GCN-O0-NEXT: s_add_i32 s22, s21, s2 +; GCN-O0-NEXT: s_add_i32 s23, s22, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s76 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s23 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s75 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s22 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s74 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s22 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s20 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s21 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s20, s21, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s73 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s20 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s72 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s21 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s19 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s18 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:32 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s21 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_add_i32 s14, s21, s8 +; GCN-O0-NEXT: s_add_i32 s18, s14, s2 +; GCN-O0-NEXT: s_add_i32 s19, s18, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s71 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s19 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s69 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s18 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s67 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s18 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s15 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s14 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s15, s14, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s66 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s15 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s63 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s14 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s40 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s21 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:8 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s13 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s14 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_add_i32 s40, s21, s9 +; GCN-O0-NEXT: v_writelane_b32 v11, s40, 15 +; GCN-O0-NEXT: s_add_i32 s9, s40, s2 +; GCN-O0-NEXT: s_add_i32 s13, s9, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s61 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s13 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s60 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s59 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s40 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s3, s40, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s58 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s55 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s40 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s54 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s21 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:16 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s35 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s40 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_add_i32 s35, s40, s8 +; GCN-O0-NEXT: v_writelane_b32 v11, s35, 16 +; GCN-O0-NEXT: s_add_i32 s8, s35, s2 +; GCN-O0-NEXT: s_add_i32 s2, s8, s12 +; GCN-O0-NEXT: s_add_i32 s2, s2, s53 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s51 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s2 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen +; GCN-O0-NEXT: v_mov_b32_e32 v0, s50 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s8 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s49 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s8 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s47 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s35 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: s_add_i32 s12, s35, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s43 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s12 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s42 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s35 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s41 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s40 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:8 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s37 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s35 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s31 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[96:99], 0 offset:8 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s30 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[96:99], 0 offset:4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s27 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[96:99], 0 offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s25 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[96:99], 0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s17 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[96:99], 0 offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s11 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[96:99], 0 offset:16 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s10 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[96:99], 0 offset:32 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s7 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[96:99], 0 offset:64 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[96:99], 0 offen +; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s6 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: buffer_load_ubyte v2, v2, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v2, 1, v2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, 1 +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s16 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[16:17], v1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s24 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[24:25], v1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s26 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s26 +; GCN-O0-NEXT: buffer_load_ubyte v2, v2, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v2, 1, v2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[26:27], v2, 1 +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[30:31], v1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s34 +; GCN-O0-NEXT: buffer_load_ubyte v2, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s34 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s34 +; GCN-O0-NEXT: buffer_load_ubyte v3, v3, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v3, 1, v3 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[34:35], v3, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s36 +; GCN-O0-NEXT: buffer_load_ubyte v3, v3, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v3, 1, v3 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v3, 1 +; GCN-O0-NEXT: v_and_b32_e64 v2, 1, v2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[40:41], v2, 1 +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[42:43], v1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s46 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[46:47], v1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s48 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s48 +; GCN-O0-NEXT: buffer_load_ubyte v2, v2, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v2, 1, v2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[48:49], v2, 1 +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[50:51], v1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s52 +; GCN-O0-NEXT: buffer_load_ubyte v3, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s52 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s52 +; GCN-O0-NEXT: buffer_load_ubyte v4, v2, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s52 +; GCN-O0-NEXT: buffer_load_ubyte v2, v2, s[96:99], 0 offen offset:8 +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: v_and_b32_e64 v4, 1, v4 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[52:53], v4, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s45 +; GCN-O0-NEXT: buffer_load_ubyte v4, v4, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v4, 1, v4 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[54:55], v4, 1 +; GCN-O0-NEXT: v_and_b32_e64 v3, 1, v3 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[58:59], v3, 1 +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[60:61], v1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s44 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[44:45], v1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s62 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s62 +; GCN-O0-NEXT: buffer_load_ubyte v3, v3, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v3, 1, v3 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[62:63], v3, 1 +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[66:67], v1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s68 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s68 +; GCN-O0-NEXT: buffer_load_ubyte v3, v3, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s68 +; GCN-O0-NEXT: buffer_load_ubyte v4, v4, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v4, 1, v4 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[68:69], v4, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s70 +; GCN-O0-NEXT: buffer_load_ubyte v4, v4, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v4, 1, v4 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[70:71], v4, 1 +; GCN-O0-NEXT: v_and_b32_e64 v3, 1, v3 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[72:73], v3, 1 +; GCN-O0-NEXT: v_and_b32_e64 v2, 1, v2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[74:75], v2, 1 +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[76:77], v1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s80 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[80:81], v1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s82 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s82 +; GCN-O0-NEXT: buffer_load_ubyte v2, v2, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v2, 1, v2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[82:83], v2, 1 +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[84:85], v1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s86 +; GCN-O0-NEXT: buffer_load_ubyte v4, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s86 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s86 +; GCN-O0-NEXT: buffer_load_ubyte v5, v2, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s86 +; GCN-O0-NEXT: buffer_load_ubyte v2, v2, s[96:99], 0 offen offset:8 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s86 +; GCN-O0-NEXT: buffer_load_ubyte v3, v3, s[96:99], 0 offen offset:16 +; GCN-O0-NEXT: s_waitcnt vmcnt(2) +; GCN-O0-NEXT: v_and_b32_e64 v5, 1, v5 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[86:87], v5, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s79 +; GCN-O0-NEXT: buffer_load_ubyte v5, v5, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v5, 1, v5 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[88:89], v5, 1 +; GCN-O0-NEXT: v_and_b32_e64 v4, 1, v4 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[92:93], v4, 1 +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[94:95], v1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 vcc, v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v11, vcc_lo, 17 +; GCN-O0-NEXT: v_writelane_b32 v11, vcc_hi, 18 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s0 +; GCN-O0-NEXT: buffer_load_ubyte v4, v4, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v4, 1, v4 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 vcc, v4, 1 +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 19 +; GCN-O0-NEXT: v_writelane_b32 v11, s1, 20 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 58 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s0 +; GCN-O0-NEXT: buffer_load_ubyte v4, v4, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s0 +; GCN-O0-NEXT: buffer_load_ubyte v5, v5, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v5, 1, v5 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v5, 1 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 21 +; GCN-O0-NEXT: v_writelane_b32 v11, s1, 22 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 61 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s0 +; GCN-O0-NEXT: buffer_load_ubyte v5, v5, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v5, 1, v5 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v5, 1 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 23 +; GCN-O0-NEXT: v_writelane_b32 v11, s1, 24 +; GCN-O0-NEXT: v_and_b32_e64 v4, 1, v4 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, 1 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 25 +; GCN-O0-NEXT: v_writelane_b32 v11, s1, 26 +; GCN-O0-NEXT: v_and_b32_e64 v2, 1, v2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, 1 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 27 +; GCN-O0-NEXT: v_writelane_b32 v11, s1, 28 +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 29 +; GCN-O0-NEXT: v_writelane_b32 v11, s1, 30 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 31 +; GCN-O0-NEXT: v_writelane_b32 v11, s1, 32 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 63 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 +; GCN-O0-NEXT: buffer_load_ubyte v2, v2, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v2, 1, v2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, 1 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 33 +; GCN-O0-NEXT: v_writelane_b32 v11, s1, 34 +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 35 +; GCN-O0-NEXT: v_writelane_b32 v11, s1, 36 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 62 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 +; GCN-O0-NEXT: buffer_load_ubyte v4, v2, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 +; GCN-O0-NEXT: buffer_load_ubyte v5, v2, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 +; GCN-O0-NEXT: buffer_load_ubyte v2, v2, s[96:99], 0 offen offset:8 +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: v_and_b32_e64 v5, 1, v5 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v5, 1 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 37 +; GCN-O0-NEXT: v_writelane_b32 v11, s1, 38 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 3 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 2 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s78 +; GCN-O0-NEXT: buffer_load_ubyte v5, v5, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v5, 1, v5 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[78:79], v5, 1 +; GCN-O0-NEXT: v_writelane_b32 v11, s78, 39 +; GCN-O0-NEXT: v_writelane_b32 v11, s79, 40 +; GCN-O0-NEXT: v_and_b32_e64 v4, 1, v4 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[78:79], v4, 1 +; GCN-O0-NEXT: v_writelane_b32 v11, s78, 41 +; GCN-O0-NEXT: v_writelane_b32 v11, s79, 42 +; GCN-O0-NEXT: v_and_b32_e64 v3, 1, v3 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[78:79], v3, 1 +; GCN-O0-NEXT: v_writelane_b32 v11, s78, 43 +; GCN-O0-NEXT: v_writelane_b32 v11, s79, 44 +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[78:79], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v11, s78, 45 +; GCN-O0-NEXT: v_writelane_b32 v11, s79, 46 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[78:79], v1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s0 +; GCN-O0-NEXT: buffer_load_ubyte v3, v3, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v3, 1, v3 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, 1 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 47 +; GCN-O0-NEXT: v_writelane_b32 v11, s1, 48 +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 49 +; GCN-O0-NEXT: v_writelane_b32 v11, s1, 50 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s0 +; GCN-O0-NEXT: buffer_load_ubyte v3, v3, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s0 +; GCN-O0-NEXT: buffer_load_ubyte v4, v4, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v4, 1, v4 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, 1 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 51 +; GCN-O0-NEXT: v_writelane_b32 v11, s1, 52 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 4 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s0 +; GCN-O0-NEXT: buffer_load_ubyte v4, v4, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v4, 1, v4 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, 1 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 53 +; GCN-O0-NEXT: v_writelane_b32 v11, s1, 54 +; GCN-O0-NEXT: v_and_b32_e64 v3, 1, v3 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, 1 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 55 +; GCN-O0-NEXT: v_writelane_b32 v11, s1, 56 +; GCN-O0-NEXT: v_and_b32_e64 v2, 1, v2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, 1 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 57 +; GCN-O0-NEXT: v_writelane_b32 v11, s1, 58 +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 59 +; GCN-O0-NEXT: v_writelane_b32 v11, s1, 60 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 7 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 61 +; GCN-O0-NEXT: v_writelane_b32 v11, s1, 62 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 6 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 +; GCN-O0-NEXT: buffer_load_ubyte v2, v2, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v2, 1, v2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, 1 +; GCN-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v11, s0, 63 +; GCN-O0-NEXT: v_writelane_b32 v8, s1, 0 +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s0, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s1, 2 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 5 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_load_ubyte v5, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 +; GCN-O0-NEXT: buffer_load_ubyte v6, v2, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 +; GCN-O0-NEXT: buffer_load_ubyte v2, v2, s[96:99], 0 offen offset:8 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s0 +; GCN-O0-NEXT: buffer_load_ubyte v3, v3, s[96:99], 0 offen offset:16 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s0 +; GCN-O0-NEXT: buffer_load_ubyte v4, v4, s[96:99], 0 offen offset:32 +; GCN-O0-NEXT: s_waitcnt vmcnt(3) +; GCN-O0-NEXT: v_and_b32_e64 v6, 1, v6 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v6, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s0, 3 +; GCN-O0-NEXT: v_writelane_b32 v8, s1, 4 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 8 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s0 +; GCN-O0-NEXT: buffer_load_ubyte v6, v6, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v6, 1, v6 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v6, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s0, 5 +; GCN-O0-NEXT: v_writelane_b32 v8, s1, 6 +; GCN-O0-NEXT: v_and_b32_e64 v5, 1, v5 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v5, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s0, 7 +; GCN-O0-NEXT: v_writelane_b32 v8, s1, 8 +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s0, 9 +; GCN-O0-NEXT: v_writelane_b32 v8, s1, 10 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 11 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s0, 11 +; GCN-O0-NEXT: v_writelane_b32 v8, s1, 12 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 10 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s0 +; GCN-O0-NEXT: buffer_load_ubyte v5, v5, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v5, 1, v5 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v5, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s0, 13 +; GCN-O0-NEXT: v_writelane_b32 v8, s1, 14 +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s0, 15 +; GCN-O0-NEXT: v_writelane_b32 v8, s1, 16 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 9 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s0 +; GCN-O0-NEXT: buffer_load_ubyte v5, v5, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s0 +; GCN-O0-NEXT: buffer_load_ubyte v6, v6, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v6, 1, v6 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v6, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s0, 17 +; GCN-O0-NEXT: v_writelane_b32 v8, s1, 18 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 12 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s0 +; GCN-O0-NEXT: buffer_load_ubyte v6, v6, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v6, 1, v6 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v6, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s0, 19 +; GCN-O0-NEXT: v_writelane_b32 v8, s1, 20 +; GCN-O0-NEXT: v_and_b32_e64 v5, 1, v5 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v5, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s0, 21 +; GCN-O0-NEXT: v_writelane_b32 v8, s1, 22 +; GCN-O0-NEXT: v_and_b32_e64 v2, 1, v2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s0, 23 +; GCN-O0-NEXT: v_writelane_b32 v8, s1, 24 +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s0, 25 +; GCN-O0-NEXT: v_writelane_b32 v8, s1, 26 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 14 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s0, 27 +; GCN-O0-NEXT: v_writelane_b32 v8, s1, 28 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 13 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 +; GCN-O0-NEXT: buffer_load_ubyte v2, v2, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v2, 1, v2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s0, 29 +; GCN-O0-NEXT: v_writelane_b32 v8, s1, 30 +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s0, 31 +; GCN-O0-NEXT: v_writelane_b32 v8, s1, 32 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 15 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 16 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s90 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s90 +; GCN-O0-NEXT: buffer_load_ubyte v5, v2, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s90 +; GCN-O0-NEXT: buffer_load_ubyte v6, v2, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s90 +; GCN-O0-NEXT: buffer_load_ubyte v2, v2, s[96:99], 0 offen offset:8 +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: v_and_b32_e64 v6, 1, v6 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[90:91], v6, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s90, 33 +; GCN-O0-NEXT: v_writelane_b32 v8, s91, 34 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s57 +; GCN-O0-NEXT: buffer_load_ubyte v6, v6, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v6, 1, v6 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[90:91], v6, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s90, 35 +; GCN-O0-NEXT: v_writelane_b32 v8, s91, 36 +; GCN-O0-NEXT: v_and_b32_e64 v5, 1, v5 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[90:91], v5, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s90, 37 +; GCN-O0-NEXT: v_writelane_b32 v8, s91, 38 +; GCN-O0-NEXT: v_and_b32_e64 v3, 1, v3 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[90:91], v3, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s90, 39 +; GCN-O0-NEXT: v_writelane_b32 v8, s91, 40 +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[90:91], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s90, 41 +; GCN-O0-NEXT: v_writelane_b32 v8, s91, 42 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s56 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[56:57], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s56, 43 +; GCN-O0-NEXT: v_writelane_b32 v8, s57, 44 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s39 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s39 +; GCN-O0-NEXT: buffer_load_ubyte v3, v3, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v3, 1, v3 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[56:57], v3, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s56, 45 +; GCN-O0-NEXT: v_writelane_b32 v8, s57, 46 +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[56:57], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s56, 47 +; GCN-O0-NEXT: v_writelane_b32 v8, s57, 48 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s38 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s38 +; GCN-O0-NEXT: buffer_load_ubyte v3, v3, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s38 +; GCN-O0-NEXT: buffer_load_ubyte v5, v5, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v5, 1, v5 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[38:39], v5, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s38, 49 +; GCN-O0-NEXT: v_writelane_b32 v8, s39, 50 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s33 +; GCN-O0-NEXT: buffer_load_ubyte v5, v5, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v5, 1, v5 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[38:39], v5, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s38, 51 +; GCN-O0-NEXT: v_writelane_b32 v8, s39, 52 +; GCN-O0-NEXT: v_and_b32_e64 v3, 1, v3 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[38:39], v3, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s38, 53 +; GCN-O0-NEXT: v_writelane_b32 v8, s39, 54 +; GCN-O0-NEXT: v_and_b32_e64 v2, 1, v2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[38:39], v2, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s38, 55 +; GCN-O0-NEXT: v_writelane_b32 v8, s39, 56 +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[38:39], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s38, 57 +; GCN-O0-NEXT: v_writelane_b32 v8, s39, 58 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s23 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[38:39], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s38, 59 +; GCN-O0-NEXT: v_writelane_b32 v8, s39, 60 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s22 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[38:39], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v8, s38, 61 +; GCN-O0-NEXT: v_writelane_b32 v8, s39, 62 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s22 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[22:23], v1, 1 +; GCN-O0-NEXT: ; implicit-def: $vgpr9 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v8, s22, 63 +; GCN-O0-NEXT: v_writelane_b32 v9, s23, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s21 +; GCN-O0-NEXT: buffer_load_ubyte v3, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s21 +; GCN-O0-NEXT: buffer_load_ubyte v5, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s21 +; GCN-O0-NEXT: buffer_load_ubyte v6, v1, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s21 +; GCN-O0-NEXT: buffer_load_ubyte v2, v1, s[96:99], 0 offen offset:8 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s21 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:16 +; GCN-O0-NEXT: s_waitcnt vmcnt(2) +; GCN-O0-NEXT: v_and_b32_e64 v6, 1, v6 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[22:23], v6, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s22, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s23, 2 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s20 +; GCN-O0-NEXT: buffer_load_ubyte v6, v6, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v6, 1, v6 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[20:21], v6, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s20, 3 +; GCN-O0-NEXT: v_writelane_b32 v9, s21, 4 +; GCN-O0-NEXT: v_and_b32_e64 v5, 1, v5 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[20:21], v5, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s20, 5 +; GCN-O0-NEXT: v_writelane_b32 v9, s21, 6 +; GCN-O0-NEXT: v_and_b32_e64 v4, 1, v4 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[20:21], v4, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s20, 7 +; GCN-O0-NEXT: v_writelane_b32 v9, s21, 8 +; GCN-O0-NEXT: v_and_b32_e64 v3, 1, v3 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[20:21], v3, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s20, 9 +; GCN-O0-NEXT: v_writelane_b32 v9, s21, 10 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s19 +; GCN-O0-NEXT: buffer_load_ubyte v3, v3, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v3, 1, v3 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[20:21], v3, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s20, 11 +; GCN-O0-NEXT: v_writelane_b32 v9, s21, 12 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s18 +; GCN-O0-NEXT: buffer_load_ubyte v3, v3, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v3, 1, v3 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[20:21], v3, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s20, 13 +; GCN-O0-NEXT: v_writelane_b32 v9, s21, 14 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s18 +; GCN-O0-NEXT: buffer_load_ubyte v3, v3, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v3, 1, v3 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[18:19], v3, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s18, 15 +; GCN-O0-NEXT: v_writelane_b32 v9, s19, 16 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s14 +; GCN-O0-NEXT: buffer_load_ubyte v3, v3, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v3, 1, v3 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[18:19], v3, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s18, 17 +; GCN-O0-NEXT: v_writelane_b32 v9, s19, 18 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s15 +; GCN-O0-NEXT: buffer_load_ubyte v3, v3, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v3, 1, v3 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[18:19], v3, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s18, 19 +; GCN-O0-NEXT: v_writelane_b32 v9, s19, 20 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s14 +; GCN-O0-NEXT: buffer_load_ubyte v3, v3, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v3, 1, v3 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[18:19], v3, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s18, 21 +; GCN-O0-NEXT: v_writelane_b32 v9, s19, 22 +; GCN-O0-NEXT: v_and_b32_e64 v2, 1, v2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[18:19], v2, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s18, 23 +; GCN-O0-NEXT: v_writelane_b32 v9, s19, 24 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s14 +; GCN-O0-NEXT: buffer_load_ubyte v2, v2, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v2, 1, v2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[14:15], v2, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s14, 25 +; GCN-O0-NEXT: v_writelane_b32 v9, s15, 26 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s13 +; GCN-O0-NEXT: buffer_load_ubyte v2, v2, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v2, 1, v2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[14:15], v2, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s14, 27 +; GCN-O0-NEXT: v_writelane_b32 v9, s15, 28 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: buffer_load_ubyte v2, v2, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v2, 1, v2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[14:15], v2, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s14, 29 +; GCN-O0-NEXT: v_writelane_b32 v9, s15, 30 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: buffer_load_ubyte v2, v2, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v2, 1, v2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[14:15], v2, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s14, 31 +; GCN-O0-NEXT: v_writelane_b32 v9, s15, 32 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s1 +; GCN-O0-NEXT: buffer_load_ubyte v2, v2, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v2, 1, v2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[14:15], v2, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s14, 33 +; GCN-O0-NEXT: v_writelane_b32 v9, s15, 34 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 +; GCN-O0-NEXT: buffer_load_ubyte v2, v2, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v2, 1, v2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[14:15], v2, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s14, 35 +; GCN-O0-NEXT: v_writelane_b32 v9, s15, 36 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s1 +; GCN-O0-NEXT: buffer_load_ubyte v2, v2, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v2, 1, v2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[14:15], v2, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s14, 37 +; GCN-O0-NEXT: v_writelane_b32 v9, s15, 38 +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[14:15], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s14, 39 +; GCN-O0-NEXT: v_writelane_b32 v9, s15, 40 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[14:15], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s14, 41 +; GCN-O0-NEXT: v_writelane_b32 v9, s15, 42 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s2 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s2, 43 +; GCN-O0-NEXT: v_writelane_b32 v9, s3, 44 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s8 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s8 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s8, 45 +; GCN-O0-NEXT: v_writelane_b32 v9, s9, 46 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:4 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s12 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[12:13], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s12, 47 +; GCN-O0-NEXT: v_writelane_b32 v9, s13, 48 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[14:15], v1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:8 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[12:13], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s12, 49 +; GCN-O0-NEXT: v_writelane_b32 v9, s13, 50 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: buffer_load_ubyte v1, v1, s[96:99], 0 offen offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s0, 51 +; GCN-O0-NEXT: v_writelane_b32 v9, s1, 52 +; GCN-O0-NEXT: buffer_load_ubyte v6, off, s[96:99], 0 +; GCN-O0-NEXT: buffer_load_ubyte v5, off, s[96:99], 0 offset:1 +; GCN-O0-NEXT: buffer_load_ubyte v4, off, s[96:99], 0 offset:2 +; GCN-O0-NEXT: buffer_load_ubyte v7, off, s[96:99], 0 offset:4 +; GCN-O0-NEXT: buffer_load_ubyte v3, off, s[96:99], 0 offset:8 +; GCN-O0-NEXT: buffer_load_ubyte v2, off, s[96:99], 0 offset:16 +; GCN-O0-NEXT: buffer_load_ubyte v1, off, s[96:99], 0 offset:32 +; GCN-O0-NEXT: s_waitcnt vmcnt(3) +; GCN-O0-NEXT: v_and_b32_e64 v7, 1, v7 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[12:13], v7, 1 +; GCN-O0-NEXT: v_and_b32_e64 v6, 1, v6 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[18:19], v6, 1 +; GCN-O0-NEXT: v_and_b32_e64 v5, 1, v5 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[20:21], v5, 1 +; GCN-O0-NEXT: v_and_b32_e64 v4, 1, v4 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[22:23], v4, 1 +; GCN-O0-NEXT: s_waitcnt vmcnt(2) +; GCN-O0-NEXT: v_and_b32_e64 v3, 1, v3 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[38:39], v3, 1 +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: v_and_b32_e64 v2, 1, v2 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[56:57], v2, 1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[90:91], v1, 1 +; GCN-O0-NEXT: buffer_load_ubyte v1, off, s[96:99], 0 offset:64 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, 1, v1 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s0, 53 +; GCN-O0-NEXT: v_writelane_b32 v9, s1, 54 +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 51 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 52 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 49 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 50 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, v0, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 47 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 48 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[14:15] +; GCN-O0-NEXT: s_mov_b32 s15, 2 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s15, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 45 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 46 +; GCN-O0-NEXT: s_mov_b32 s14, 3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s14, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[8:9] +; GCN-O0-NEXT: s_mov_b32 s9, 4 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s9, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 43 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 44 +; GCN-O0-NEXT: s_mov_b32 s8, 5 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s8, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] ; GCN-O0-NEXT: s_mov_b32 s3, 6 ; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 55 +; GCN-O0-NEXT: v_readlane_b32 s1, v12, 56 ; GCN-O0-NEXT: s_mov_b32 s2, 7 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 -; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 15 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:364 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:360 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:356 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:352 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:348 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:344 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:340 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:336 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s2, v2 +; GCN-O0-NEXT: v_or_b32_e64 v3, v1, v2 +; GCN-O0-NEXT: s_mov_b32 s33, s0 +; GCN-O0-NEXT: v_writelane_b32 v9, s33, 55 +; GCN-O0-NEXT: s_mov_b32 s33, s1 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v12, 1 +; GCN-O0-NEXT: v_writelane_b32 v9, s33, 56 +; GCN-O0-NEXT: s_mov_b32 s33, s0 +; GCN-O0-NEXT: v_writelane_b32 v9, s33, 57 +; GCN-O0-NEXT: s_mov_b32 s33, s1 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 57 +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 55 +; GCN-O0-NEXT: s_add_u32 s0, s0, s1 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 56 +; GCN-O0-NEXT: s_addc_u32 s33, s1, s33 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s33 +; GCN-O0-NEXT: v_writelane_b32 v9, s0, 58 +; GCN-O0-NEXT: v_writelane_b32 v9, s1, 59 +; GCN-O0-NEXT: s_mov_b32 s33, s0 +; GCN-O0-NEXT: v_writelane_b32 v9, s33, 60 +; GCN-O0-NEXT: s_mov_b32 s33, s1 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v12, 3 +; GCN-O0-NEXT: v_writelane_b32 v9, s33, 61 +; GCN-O0-NEXT: s_mov_b32 s33, s0 +; GCN-O0-NEXT: v_writelane_b32 v9, s33, 62 +; GCN-O0-NEXT: s_mov_b32 s33, s1 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 62 +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 60 +; GCN-O0-NEXT: s_add_u32 s0, s0, s1 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 61 +; GCN-O0-NEXT: s_addc_u32 s33, s1, s33 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s33 +; GCN-O0-NEXT: ; implicit-def: $vgpr10 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v9, s0, 63 +; GCN-O0-NEXT: v_writelane_b32 v10, s1, 0 +; GCN-O0-NEXT: s_mov_b32 s33, s0 +; GCN-O0-NEXT: v_writelane_b32 v10, s33, 1 +; GCN-O0-NEXT: v_readlane_b32 s0, v10, 1 +; GCN-O0-NEXT: v_writelane_b32 v10, s1, 2 +; GCN-O0-NEXT: s_mov_b32 s1, s64 +; GCN-O0-NEXT: s_mov_b32 s33, s65 +; GCN-O0-NEXT: s_add_u32 s0, s0, s1 +; GCN-O0-NEXT: v_readlane_b32 s1, v10, 2 +; GCN-O0-NEXT: s_addc_u32 s33, s1, s33 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s33 +; GCN-O0-NEXT: v_writelane_b32 v10, s0, 3 +; GCN-O0-NEXT: v_writelane_b32 v10, s1, 4 +; GCN-O0-NEXT: s_mov_b32 s33, s0 +; GCN-O0-NEXT: v_writelane_b32 v10, s33, 5 +; GCN-O0-NEXT: v_readlane_b32 s0, v10, 5 +; GCN-O0-NEXT: v_writelane_b32 v10, s1, 6 +; GCN-O0-NEXT: s_mov_b32 s1, s28 +; GCN-O0-NEXT: s_mov_b32 s33, s29 +; GCN-O0-NEXT: s_add_u32 s0, s0, s1 +; GCN-O0-NEXT: v_readlane_b32 s1, v10, 6 +; GCN-O0-NEXT: s_addc_u32 s33, s1, s33 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s33 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 41 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 42 +; GCN-O0-NEXT: flat_store_byte v[1:2], v3 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 39 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 40 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, v0, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 37 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 38 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 35 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 36 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s15, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 33 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 34 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s14, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 31 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 32 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s9, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 29 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 30 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s8, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 27 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 28 ; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 -; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 14 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:332 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:328 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:324 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:320 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:316 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:312 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:308 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:304 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v10, 3 +; GCN-O0-NEXT: v_readlane_b32 s1, v10, 4 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s2, v2 +; GCN-O0-NEXT: v_or_b32_e64 v3, v1, v2 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 25 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 26 +; GCN-O0-NEXT: flat_store_byte v[1:2], v3 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 23 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 24 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, v0, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 21 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 22 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 19 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 20 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s15, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 17 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 18 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s14, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 15 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 16 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s9, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 13 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 14 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s8, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 11 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 12 ; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 -; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 13 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:300 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:296 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:292 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:288 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:284 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:280 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:276 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:272 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 63 +; GCN-O0-NEXT: v_readlane_b32 s1, v10, 0 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s2, v2 +; GCN-O0-NEXT: v_or_b32_e64 v3, v1, v2 +; GCN-O0-NEXT: s_mov_b32 s33, s0 +; GCN-O0-NEXT: v_writelane_b32 v10, s33, 7 +; GCN-O0-NEXT: v_readlane_b32 s0, v10, 7 +; GCN-O0-NEXT: v_writelane_b32 v10, s1, 8 +; GCN-O0-NEXT: s_mov_b32 s1, s28 +; GCN-O0-NEXT: s_mov_b32 s33, s29 +; GCN-O0-NEXT: s_add_u32 s0, s0, s1 +; GCN-O0-NEXT: v_readlane_b32 s1, v10, 8 +; GCN-O0-NEXT: s_addc_u32 s33, s1, s33 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s33 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 9 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 10 +; GCN-O0-NEXT: flat_store_byte v[1:2], v3 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 7 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 8 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, v0, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 5 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 6 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 3 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 4 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s15, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 1 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 2 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s14, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 63 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 0 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s9, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 61 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 62 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s8, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 59 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 60 ; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 -; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 12 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:268 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:264 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:260 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:256 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:252 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:248 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:244 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:240 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 63 +; GCN-O0-NEXT: v_readlane_b32 s1, v10, 0 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s2, v2 +; GCN-O0-NEXT: v_or_b32_e64 v3, v1, v2 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 57 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 58 +; GCN-O0-NEXT: flat_store_byte v[1:2], v3 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 55 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 56 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, v0, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 53 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 54 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 51 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 52 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s15, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 49 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 50 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s14, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 47 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 48 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s9, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 45 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 46 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s8, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 43 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 44 ; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 -; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 11 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:236 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:232 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:228 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:224 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:220 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:216 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:212 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:208 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 58 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 59 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s2, v2 +; GCN-O0-NEXT: v_or_b32_e64 v3, v1, v2 +; GCN-O0-NEXT: s_mov_b32 s33, s0 +; GCN-O0-NEXT: v_writelane_b32 v10, s33, 9 +; GCN-O0-NEXT: v_readlane_b32 s0, v10, 9 +; GCN-O0-NEXT: v_writelane_b32 v10, s1, 10 +; GCN-O0-NEXT: s_mov_b32 s1, s64 +; GCN-O0-NEXT: s_mov_b32 s33, s65 +; GCN-O0-NEXT: s_add_u32 s0, s0, s1 +; GCN-O0-NEXT: v_readlane_b32 s1, v10, 10 +; GCN-O0-NEXT: s_addc_u32 s33, s1, s33 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s33 +; GCN-O0-NEXT: v_writelane_b32 v10, s0, 11 +; GCN-O0-NEXT: v_writelane_b32 v10, s1, 12 +; GCN-O0-NEXT: s_mov_b32 s33, s0 +; GCN-O0-NEXT: v_writelane_b32 v10, s33, 13 +; GCN-O0-NEXT: v_readlane_b32 s0, v10, 13 +; GCN-O0-NEXT: v_writelane_b32 v10, s1, 14 +; GCN-O0-NEXT: s_mov_b32 s1, s28 +; GCN-O0-NEXT: s_mov_b32 s33, s29 +; GCN-O0-NEXT: s_add_u32 s0, s0, s1 +; GCN-O0-NEXT: v_readlane_b32 s1, v10, 14 +; GCN-O0-NEXT: s_addc_u32 s33, s1, s33 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s33 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 41 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 42 +; GCN-O0-NEXT: flat_store_byte v[1:2], v3 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 39 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 40 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, v0, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 37 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 38 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 35 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 36 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s15, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 33 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 34 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s14, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 31 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 32 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s9, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 29 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 30 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s8, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 27 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 28 ; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 -; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 10 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:204 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:200 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:196 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:192 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:188 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:184 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:180 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:176 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v10, 11 +; GCN-O0-NEXT: v_readlane_b32 s1, v10, 12 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s2, v2 +; GCN-O0-NEXT: v_or_b32_e64 v3, v1, v2 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 25 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 26 +; GCN-O0-NEXT: flat_store_byte v[1:2], v3 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 23 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 24 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, v0, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 21 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 22 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 19 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 20 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s15, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 17 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 18 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s14, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 15 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 16 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s9, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 13 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 14 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s8, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 11 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 12 ; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 -; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 9 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:172 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:168 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:164 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:160 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:156 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:152 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:148 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:144 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 58 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 59 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s2, v2 +; GCN-O0-NEXT: v_or_b32_e64 v3, v1, v2 +; GCN-O0-NEXT: s_mov_b32 s33, s0 +; GCN-O0-NEXT: v_writelane_b32 v10, s33, 15 +; GCN-O0-NEXT: v_readlane_b32 s0, v10, 15 +; GCN-O0-NEXT: v_writelane_b32 v10, s1, 16 +; GCN-O0-NEXT: s_mov_b32 s1, s28 +; GCN-O0-NEXT: s_mov_b32 s33, s29 +; GCN-O0-NEXT: s_add_u32 s0, s0, s1 +; GCN-O0-NEXT: v_readlane_b32 s1, v10, 16 +; GCN-O0-NEXT: s_addc_u32 s33, s1, s33 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s33 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 9 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 10 +; GCN-O0-NEXT: flat_store_byte v[1:2], v3 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 7 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 8 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s15, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 53 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 54 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v3, v0, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 5 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 6 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 3 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 4 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s14, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v8, 1 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 2 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s9, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 63 +; GCN-O0-NEXT: v_readlane_b32 s1, v8, 0 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s8, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 61 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 62 ; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 -; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 8 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:140 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:136 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:132 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_and_b32_e64 v57, v57, v3 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v0, v3, v0 -; GCN-O0-NEXT: v_or_b32_e64 v57, v57, v0 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:128 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_and_b32_e64 v63, v63, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v63, s7, v63 -; GCN-O0-NEXT: v_or_b32_e64 v57, v57, v63 -; GCN-O0-NEXT: v_and_b32_e64 v62, v62, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v62, s6, v62 -; GCN-O0-NEXT: v_or_b32_e64 v57, v57, v62 -; GCN-O0-NEXT: v_and_b32_e64 v61, v61, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v61, s5, v61 -; GCN-O0-NEXT: v_or_b32_e64 v57, v57, v61 -; GCN-O0-NEXT: v_and_b32_e64 v60, v60, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v60, s4, v60 -; GCN-O0-NEXT: v_or_b32_e64 v57, v57, v60 -; GCN-O0-NEXT: v_and_b32_e64 v59, v59, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v59, s3, v59 -; GCN-O0-NEXT: v_or_b32_e64 v57, v57, v59 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v58, s2, v58 -; GCN-O0-NEXT: v_or_b32_e64 v59, v57, v58 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 7 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v58, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v57, s8 -; GCN-O0-NEXT: flat_store_byte v[57:58], v59 -; GCN-O0-NEXT: v_and_b32_e64 v49, v49, v3 -; GCN-O0-NEXT: v_and_b32_e64 v56, v56, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v56, v3, v56 -; GCN-O0-NEXT: v_or_b32_e64 v49, v49, v56 -; GCN-O0-NEXT: v_and_b32_e64 v55, v55, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v55, s7, v55 -; GCN-O0-NEXT: v_or_b32_e64 v49, v49, v55 -; GCN-O0-NEXT: v_and_b32_e64 v54, v54, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v54, s6, v54 -; GCN-O0-NEXT: v_or_b32_e64 v49, v49, v54 -; GCN-O0-NEXT: v_and_b32_e64 v53, v53, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v53, s5, v53 -; GCN-O0-NEXT: v_or_b32_e64 v49, v49, v53 -; GCN-O0-NEXT: v_and_b32_e64 v52, v52, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v52, s4, v52 -; GCN-O0-NEXT: v_or_b32_e64 v49, v49, v52 -; GCN-O0-NEXT: v_and_b32_e64 v51, v51, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v51, s3, v51 -; GCN-O0-NEXT: v_or_b32_e64 v49, v49, v51 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v50, s2, v50 -; GCN-O0-NEXT: v_or_b32_e64 v51, v49, v50 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 6 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v50, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v49, s8 -; GCN-O0-NEXT: flat_store_byte v[49:50], v51 -; GCN-O0-NEXT: v_and_b32_e64 v41, v41, v3 -; GCN-O0-NEXT: v_and_b32_e64 v48, v48, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v48, v3, v48 -; GCN-O0-NEXT: v_or_b32_e64 v41, v41, v48 -; GCN-O0-NEXT: v_and_b32_e64 v47, v47, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v47, s7, v47 -; GCN-O0-NEXT: v_or_b32_e64 v41, v41, v47 -; GCN-O0-NEXT: v_and_b32_e64 v46, v46, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v46, s6, v46 -; GCN-O0-NEXT: v_or_b32_e64 v41, v41, v46 -; GCN-O0-NEXT: v_and_b32_e64 v45, v45, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v45, s5, v45 -; GCN-O0-NEXT: v_or_b32_e64 v41, v41, v45 -; GCN-O0-NEXT: v_and_b32_e64 v44, v44, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v44, s4, v44 -; GCN-O0-NEXT: v_or_b32_e64 v41, v41, v44 -; GCN-O0-NEXT: v_and_b32_e64 v43, v43, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v43, s3, v43 -; GCN-O0-NEXT: v_or_b32_e64 v41, v41, v43 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v42, s2, v42 -; GCN-O0-NEXT: v_or_b32_e64 v43, v41, v42 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 5 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v42, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v41, s8 -; GCN-O0-NEXT: flat_store_byte v[41:42], v43 -; GCN-O0-NEXT: v_and_b32_e64 v33, v33, v3 -; GCN-O0-NEXT: v_and_b32_e64 v40, v40, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v40, v3, v40 -; GCN-O0-NEXT: v_or_b32_e64 v33, v33, v40 -; GCN-O0-NEXT: v_and_b32_e64 v39, v39, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v39, s7, v39 -; GCN-O0-NEXT: v_or_b32_e64 v33, v33, v39 -; GCN-O0-NEXT: v_and_b32_e64 v38, v38, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v38, s6, v38 -; GCN-O0-NEXT: v_or_b32_e64 v33, v33, v38 -; GCN-O0-NEXT: v_and_b32_e64 v37, v37, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v37, s5, v37 -; GCN-O0-NEXT: v_or_b32_e64 v33, v33, v37 -; GCN-O0-NEXT: v_and_b32_e64 v36, v36, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v36, s4, v36 -; GCN-O0-NEXT: v_or_b32_e64 v33, v33, v36 -; GCN-O0-NEXT: v_and_b32_e64 v35, v35, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v35, s3, v35 -; GCN-O0-NEXT: v_or_b32_e64 v33, v33, v35 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v34, s2, v34 -; GCN-O0-NEXT: v_or_b32_e64 v35, v33, v34 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 4 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v34, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v33, s8 -; GCN-O0-NEXT: flat_store_byte v[33:34], v35 -; GCN-O0-NEXT: v_and_b32_e64 v25, v25, v3 -; GCN-O0-NEXT: v_and_b32_e64 v32, v32, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v32, v3, v32 -; GCN-O0-NEXT: v_or_b32_e64 v25, v25, v32 -; GCN-O0-NEXT: v_and_b32_e64 v31, v31, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v31, s7, v31 -; GCN-O0-NEXT: v_or_b32_e64 v25, v25, v31 -; GCN-O0-NEXT: v_and_b32_e64 v30, v30, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v30, s6, v30 -; GCN-O0-NEXT: v_or_b32_e64 v25, v25, v30 -; GCN-O0-NEXT: v_and_b32_e64 v29, v29, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v29, s5, v29 -; GCN-O0-NEXT: v_or_b32_e64 v25, v25, v29 -; GCN-O0-NEXT: v_and_b32_e64 v28, v28, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v28, s4, v28 -; GCN-O0-NEXT: v_or_b32_e64 v25, v25, v28 -; GCN-O0-NEXT: v_and_b32_e64 v27, v27, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v27, s3, v27 -; GCN-O0-NEXT: v_or_b32_e64 v25, v25, v27 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v26, s2, v26 -; GCN-O0-NEXT: v_or_b32_e64 v27, v25, v26 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 3 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v26, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v25, s8 -; GCN-O0-NEXT: flat_store_byte v[25:26], v27 -; GCN-O0-NEXT: v_and_b32_e64 v17, v17, v3 -; GCN-O0-NEXT: v_and_b32_e64 v24, v24, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v24, v3, v24 -; GCN-O0-NEXT: v_or_b32_e64 v17, v17, v24 -; GCN-O0-NEXT: v_and_b32_e64 v23, v23, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v23, s7, v23 -; GCN-O0-NEXT: v_or_b32_e64 v17, v17, v23 -; GCN-O0-NEXT: v_and_b32_e64 v22, v22, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v22, s6, v22 -; GCN-O0-NEXT: v_or_b32_e64 v17, v17, v22 -; GCN-O0-NEXT: v_and_b32_e64 v21, v21, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v21, s5, v21 -; GCN-O0-NEXT: v_or_b32_e64 v17, v17, v21 -; GCN-O0-NEXT: v_and_b32_e64 v20, v20, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v20, s4, v20 -; GCN-O0-NEXT: v_or_b32_e64 v17, v17, v20 -; GCN-O0-NEXT: v_and_b32_e64 v19, v19, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v19, s3, v19 -; GCN-O0-NEXT: v_or_b32_e64 v17, v17, v19 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v18, s2, v18 -; GCN-O0-NEXT: v_or_b32_e64 v19, v17, v18 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 2 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v18, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v17, s8 -; GCN-O0-NEXT: flat_store_byte v[17:18], v19 -; GCN-O0-NEXT: v_and_b32_e64 v9, v9, v3 -; GCN-O0-NEXT: v_and_b32_e64 v16, v16, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v16, v3, v16 -; GCN-O0-NEXT: v_or_b32_e64 v9, v9, v16 -; GCN-O0-NEXT: v_and_b32_e64 v15, v15, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v15, s7, v15 -; GCN-O0-NEXT: v_or_b32_e64 v9, v9, v15 -; GCN-O0-NEXT: v_and_b32_e64 v14, v14, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v14, s6, v14 -; GCN-O0-NEXT: v_or_b32_e64 v9, v9, v14 -; GCN-O0-NEXT: v_and_b32_e64 v13, v13, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v13, s5, v13 -; GCN-O0-NEXT: v_or_b32_e64 v9, v9, v13 -; GCN-O0-NEXT: v_and_b32_e64 v12, v12, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v12, s4, v12 -; GCN-O0-NEXT: v_or_b32_e64 v9, v9, v12 -; GCN-O0-NEXT: v_and_b32_e64 v11, v11, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v11, s3, v11 -; GCN-O0-NEXT: v_or_b32_e64 v9, v9, v11 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v10, s2, v10 -; GCN-O0-NEXT: v_or_b32_e64 v11, v9, v10 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 1 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s8 -; GCN-O0-NEXT: flat_store_byte v[9:10], v11 -; GCN-O0-NEXT: s_waitcnt vmcnt(7) -; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 -; GCN-O0-NEXT: v_and_b32_e64 v8, v8, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v8, v3, v8 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v8 -; GCN-O0-NEXT: v_and_b32_e64 v7, v7, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v7, s7, v7 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v7 -; GCN-O0-NEXT: v_and_b32_e64 v6, v6, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v6, s6, v6 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v6 -; GCN-O0-NEXT: v_and_b32_e64 v5, v5, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v5, s5, v5 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v5 -; GCN-O0-NEXT: v_and_b32_e64 v4, v4, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v4, s4, v4 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v4 -; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v9, 58 +; GCN-O0-NEXT: v_readlane_b32 s1, v9, 59 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s2, v2 +; GCN-O0-NEXT: v_or_b32_e64 v3, v1, v2 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 59 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 60 +; GCN-O0-NEXT: flat_store_byte v[1:2], v3 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 57 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 58 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, v0, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 55 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 56 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 53 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 54 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s15, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 51 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 52 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s14, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 49 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 50 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s9, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 47 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 48 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s8, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v12, 3 ; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[78:79] +; GCN-O0-NEXT: v_readlane_b32 s78, v12, 55 +; GCN-O0-NEXT: v_readlane_b32 s79, v12, 56 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s2, v2 +; GCN-O0-NEXT: v_or_b32_e64 v3, v1, v2 +; GCN-O0-NEXT: s_mov_b32 s33, s78 +; GCN-O0-NEXT: s_mov_b32 s78, s79 +; GCN-O0-NEXT: v_writelane_b32 v10, s78, 17 +; GCN-O0-NEXT: s_mov_b32 s78, s0 +; GCN-O0-NEXT: ; kill: def $sgpr1 killed $sgpr1 killed $sgpr0_sgpr1 +; GCN-O0-NEXT: v_readlane_b32 s0, v10, 17 +; GCN-O0-NEXT: s_add_u32 s78, s33, s78 +; GCN-O0-NEXT: s_addc_u32 s0, s0, s1 +; GCN-O0-NEXT: ; kill: def $sgpr78 killed $sgpr78 def $sgpr78_sgpr79 +; GCN-O0-NEXT: s_mov_b32 s79, s0 +; GCN-O0-NEXT: s_mov_b32 s0, s78 +; GCN-O0-NEXT: s_mov_b32 s1, s79 +; GCN-O0-NEXT: v_writelane_b32 v10, s1, 18 +; GCN-O0-NEXT: s_mov_b32 s1, s64 +; GCN-O0-NEXT: s_mov_b32 s33, s65 +; GCN-O0-NEXT: s_add_u32 s0, s0, s1 +; GCN-O0-NEXT: v_readlane_b32 s1, v10, 18 +; GCN-O0-NEXT: s_addc_u32 s33, s1, s33 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s33 +; GCN-O0-NEXT: v_writelane_b32 v10, s0, 19 +; GCN-O0-NEXT: v_writelane_b32 v10, s1, 20 +; GCN-O0-NEXT: s_mov_b32 s33, s0 +; GCN-O0-NEXT: v_writelane_b32 v10, s33, 21 +; GCN-O0-NEXT: v_readlane_b32 s0, v10, 21 +; GCN-O0-NEXT: v_writelane_b32 v10, s1, 22 +; GCN-O0-NEXT: s_mov_b32 s1, s28 +; GCN-O0-NEXT: s_mov_b32 s33, s29 +; GCN-O0-NEXT: s_add_u32 s0, s0, s1 +; GCN-O0-NEXT: v_readlane_b32 s1, v10, 22 +; GCN-O0-NEXT: s_addc_u32 s33, s1, s33 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s33 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 45 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 46 +; GCN-O0-NEXT: flat_store_byte v[1:2], v3 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 43 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 44 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, v0, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 41 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 42 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 39 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 40 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s15, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 37 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 38 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s14, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 35 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 36 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s9, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 33 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 34 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s8, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 31 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 32 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v10, 19 +; GCN-O0-NEXT: v_readlane_b32 s1, v10, 20 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s2, v2 +; GCN-O0-NEXT: v_or_b32_e64 v3, v1, v2 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 29 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 30 +; GCN-O0-NEXT: flat_store_byte v[1:2], v3 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 27 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 28 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, v0, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 25 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 26 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 23 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 24 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s15, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 21 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 22 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s14, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 19 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 20 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s9, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_readlane_b32 s0, v11, 17 +; GCN-O0-NEXT: v_readlane_b32 s1, v11, 18 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s8, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s2, v2 +; GCN-O0-NEXT: v_or_b32_e64 v3, v1, v2 +; GCN-O0-NEXT: s_mov_b32 vcc_lo, s78 +; GCN-O0-NEXT: s_mov_b32 s33, s79 +; GCN-O0-NEXT: s_mov_b32 s0, s28 +; GCN-O0-NEXT: s_mov_b32 vcc_hi, s29 +; GCN-O0-NEXT: s_add_u32 vcc_lo, vcc_lo, s0 +; GCN-O0-NEXT: v_readlane_b32 s0, v12, 55 +; GCN-O0-NEXT: v_readlane_b32 s1, v12, 56 +; GCN-O0-NEXT: s_addc_u32 s33, s33, vcc_hi +; GCN-O0-NEXT: ; kill: def $vcc_lo killed $vcc_lo def $vcc +; GCN-O0-NEXT: s_mov_b32 vcc_hi, s33 +; GCN-O0-NEXT: v_mov_b32_e32 v1, vcc_lo +; GCN-O0-NEXT: v_mov_b32_e32 v2, vcc_hi +; GCN-O0-NEXT: flat_store_byte v[1:2], v3 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[94:95] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s15, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[92:93] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v3, v0, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[90:91] +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[88:89] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s14, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[86:87] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s9, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[84:85] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s8, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[82:83] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[80:81] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s2, v2 +; GCN-O0-NEXT: v_or_b32_e64 v3, v1, v2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s78 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s79 +; GCN-O0-NEXT: flat_store_byte v[1:2], v3 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[76:77] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, v0, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[74:75] +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[72:73] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s15, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[70:71] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s14, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[68:69] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s9, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[66:67] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s8, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[62:63] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[44:45] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s2, v2 +; GCN-O0-NEXT: v_or_b32_e64 v3, v1, v2 +; GCN-O0-NEXT: s_mov_b32 s44, s0 +; GCN-O0-NEXT: s_mov_b32 s33, s1 +; GCN-O0-NEXT: s_mov_b32 s62, s64 +; GCN-O0-NEXT: s_mov_b32 s45, s65 +; GCN-O0-NEXT: s_add_u32 s44, s44, s62 +; GCN-O0-NEXT: s_addc_u32 s33, s33, s45 +; GCN-O0-NEXT: ; kill: def $sgpr44 killed $sgpr44 def $sgpr44_sgpr45 +; GCN-O0-NEXT: s_mov_b32 s45, s33 +; GCN-O0-NEXT: s_mov_b32 s62, s44 +; GCN-O0-NEXT: s_mov_b32 s33, s45 +; GCN-O0-NEXT: s_mov_b32 s64, s28 +; GCN-O0-NEXT: s_mov_b32 s63, s29 +; GCN-O0-NEXT: s_add_u32 s62, s62, s64 +; GCN-O0-NEXT: s_addc_u32 s33, s33, s63 +; GCN-O0-NEXT: ; kill: def $sgpr62 killed $sgpr62 def $sgpr62_sgpr63 +; GCN-O0-NEXT: s_mov_b32 s63, s33 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s62 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s63 +; GCN-O0-NEXT: flat_store_byte v[1:2], v3 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[60:61] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s15, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[58:59] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v3, v0, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[56:57] +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[54:55] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s14, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[52:53] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s9, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[50:51] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s8, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[48:49] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[46:47] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s2, v2 +; GCN-O0-NEXT: v_or_b32_e64 v3, v1, v2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s44 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s45 +; GCN-O0-NEXT: flat_store_byte v[1:2], v3 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[42:43] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s15, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[40:41] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v3, v0, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[38:39] +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[36:37] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s14, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[34:35] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s9, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[30:31] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s8, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[26:27] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 +; GCN-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[24:25] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s2, v2 +; GCN-O0-NEXT: v_or_b32_e64 v3, v1, v2 +; GCN-O0-NEXT: s_mov_b32 s24, s0 +; GCN-O0-NEXT: s_mov_b32 s25, s1 +; GCN-O0-NEXT: s_mov_b32 s27, s28 +; GCN-O0-NEXT: s_mov_b32 s26, s29 +; GCN-O0-NEXT: s_add_u32 s24, s24, s27 +; GCN-O0-NEXT: s_addc_u32 s26, s25, s26 +; GCN-O0-NEXT: ; kill: def $sgpr24 killed $sgpr24 def $sgpr24_sgpr25 +; GCN-O0-NEXT: s_mov_b32 s25, s26 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s24 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s25 +; GCN-O0-NEXT: flat_store_byte v[1:2], v3 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[22:23] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s15, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[20:21] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, v0, v2 +; GCN-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[18:19] ; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[16:17] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s14, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[12:13] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s9, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[10:11] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s8, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s3, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 ; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 @@ -6077,7 +8464,7 @@ define <8 x double> @double8_inselt_vec(<8 x double> %vec, i32 %sel) { ; GCN-O0: ; %bb.0: ; %entry ; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: v_mov_b32_e32 v17, v15 ; GCN-O0-NEXT: v_mov_b32_e32 v18, v14 @@ -6126,8 +8513,8 @@ define <8 x double> @double8_inselt_vec(<8 x double> %vec, i32 %sel) { ; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b32 s4, 1 -; GCN-O0-NEXT: v_lshlrev_b32_e64 v16, s4, v16 +; GCN-O0-NEXT: s_mov_b32 s4, 2 +; GCN-O0-NEXT: v_mul_lo_u32 v16, v16, s4 ; GCN-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 ; GCN-O0-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane @@ -6260,6 +8647,11 @@ define <8 x double> @double8_inselt_vec(<8 x double> %vec, i32 %sel) { ; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b32 s6, 1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_add_u32_e64 v16, s[6:7], v16, s6 +; GCN-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s4, s5 ; GCN-O0-NEXT: v_mov_b32_e32 v16, s4 ; GCN-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill @@ -6269,22 +8661,18 @@ define <8 x double> @double8_inselt_vec(<8 x double> %vec, i32 %sel) { ; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 ; GCN-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[10:11] -; GCN-O0-NEXT: s_waitcnt vmcnt(14) ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_waitcnt vmcnt(14) ; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_waitcnt vmcnt(14) ; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_waitcnt vmcnt(14) ; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill @@ -6314,29 +8702,29 @@ define <8 x double> @double8_inselt_vec(<8 x double> %vec, i32 %sel) { ; GCN-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readfirstlane_b32 s6, v17 ; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v17 ; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GCN-O0-NEXT: s_mov_b32 m0, s6 -; GCN-O0-NEXT: v_movreld_b32_e32 v1, v16 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_movreld_b32_e32 v0, v16 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill @@ -6370,22 +8758,22 @@ define <8 x double> @double8_inselt_vec(<8 x double> %vec, i32 %sel) { ; GCN-O0-NEXT: v_readlane_b32 s5, v33, 7 ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: ; %bb.6: -; GCN-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_waitcnt vmcnt(14) ; GCN-O0-NEXT: v_mov_b32_e32 v0, v15 ; GCN-O0-NEXT: v_mov_b32_e32 v1, v16 @@ -6418,7 +8806,7 @@ define <8 x double> @double8_inselt_vec(<8 x double> %vec, i32 %sel) { ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_mov_b32_e32 v15, v30 ; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll index e8edf3918663..3b5d8bb0e156 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll @@ -21,11 +21,18 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: s_mov_b64 s[16:17], s[8:9] ; CHECK-NEXT: s_load_dword s8, s[16:17], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_load_dword s8, s[16:17], 0x0 +; CHECK-NEXT: s_mov_b32 s9, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_cmp_lg_u32 s8, s9 +; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 ; CHECK-NEXT: v_writelane_b32 v40, s8, 1 +; CHECK-NEXT: v_writelane_b32 v40, s9, 2 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def vgpr10 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_add_i32 s8, s33, 0x100100 +; CHECK-NEXT: s_add_i32 s8, s33, 0x100000 +; CHECK-NEXT: s_nop 2 ; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 s[18:19], 8 ; CHECK-NEXT: s_mov_b32 s8, s16 @@ -55,15 +62,12 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: v_mov_b32_e32 v0, s18 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: s_add_i32 s4, s33, 0x100100 -; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s4 ; 4-byte Folded Reload ; CHECK-NEXT: v_readlane_b32 s4, v40, 1 -; CHECK-NEXT: s_mov_b32 s5, 0 -; CHECK-NEXT: s_cmp_eq_u32 s4, s5 -; CHECK-NEXT: v_mov_b32_e32 v0, 0x4000 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], s33 offen ; 4-byte Folded Spill -; CHECK-NEXT: s_cbranch_scc1 .LBB0_2 +; CHECK-NEXT: v_readlane_b32 s5, v40, 2 +; CHECK-NEXT: s_mov_b64 s[6:7], -1 +; CHECK-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] +; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5] +; CHECK-NEXT: s_cbranch_vccnz .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %store ; CHECK-NEXT: s_add_i32 s4, s33, 0x100000 ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s4 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll index cbf697fafe68..954ccb7dec4f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll @@ -7,6 +7,7 @@ define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 { ; MIR: bb.0 (%ir-block.0): ; MIR-NEXT: liveins: $sgpr8_sgpr9 ; MIR-NEXT: {{ $}} + ; MIR-NEXT: dead renamable $sgpr4 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s32) from constant-pool, align 16, addrspace 4) ; MIR-NEXT: renamable $sgpr4 = S_LOAD_DWORD_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s32) from %ir.val.kernarg.offset, align 16, addrspace 4) ; MIR-NEXT: $m0 = S_MOV_B32 0 ; MIR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll index f17ad7f2a0a1..9c42619141ec 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll @@ -25,6 +25,12 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -118,6 +124,12 @@ define amdgpu_kernel void @dpp_test_bc(ptr addrspace(1) %out, i32 %in1, i32 %in2 ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -215,21 +227,26 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr { ; ; GFX8-NOOPT-LABEL: dpp_test1: ; GFX8-NOOPT: ; %bb.0: ; %bb +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOOPT-NEXT: s_mov_b32 s0, 2 -; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v3, s0, v0 +; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v3, s0, v2 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NOOPT-NEXT: v_add_u32_e64 v3, s[4:5], v2, v3 ; GFX8-NOOPT-NEXT: s_mov_b32 m0, -1 -; GFX8-NOOPT-NEXT: ds_read_b32 v0, v3 +; GFX8-NOOPT-NEXT: ds_read_b32 v3, v3 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOOPT-NEXT: s_barrier -; GFX8-NOOPT-NEXT: v_add_u32_e64 v1, s[0:1], v0, v0 -; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NOOPT-NEXT: v_add_u32_e64 v3, s[4:5], v3, v3 ; GFX8-NOOPT-NEXT: s_nop 1 -; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf -; GFX8-NOOPT-NEXT: v_add_u32_e64 v2, s[0:1], v0, v1 -; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NOOPT-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GFX8-NOOPT-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf +; GFX8-NOOPT-NEXT: v_add_u32_e64 v2, s[4:5], v2, v3 +; GFX8-NOOPT-NEXT: v_lshlrev_b64 v[3:4], s0, v[0:1] ; GFX8-NOOPT-NEXT: s_mov_b32 s0, s2 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s3 @@ -358,14 +375,19 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i ; GFX8-NOOPT-LABEL: update_dppi64_test: ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 -; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NOOPT-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 +; GFX8-NOOPT-NEXT: v_lshlrev_b64 v[1:2], s2, v[0:1] ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s4 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NOOPT-NEXT: s_mov_b32 s4, s5 @@ -484,14 +506,19 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1 ; GFX8-NOOPT-LABEL: update_dppf64_test: ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 -; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NOOPT-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 +; GFX8-NOOPT-NEXT: v_lshlrev_b64 v[1:2], s2, v[0:1] ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s4 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NOOPT-NEXT: s_mov_b32 s4, s5 @@ -610,14 +637,19 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> ; GFX8-NOOPT-LABEL: update_dppv2i32_test: ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 -; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NOOPT-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 +; GFX8-NOOPT-NEXT: v_lshlrev_b64 v[1:2], s2, v[0:1] ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s4 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NOOPT-NEXT: s_mov_b32 s4, s5 @@ -736,14 +768,19 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa ; GFX8-NOOPT-LABEL: update_dppv2f32_test: ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 -; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NOOPT-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 +; GFX8-NOOPT-NEXT: v_lshlrev_b64 v[1:2], s2, v[0:1] ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s4 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NOOPT-NEXT: s_mov_b32 s4, s5 @@ -862,14 +899,19 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p ; GFX8-NOOPT-LABEL: update_dpp_p0_test: ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 -; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NOOPT-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 +; GFX8-NOOPT-NEXT: v_lshlrev_b64 v[1:2], s2, v[0:1] ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s4 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NOOPT-NEXT: s_mov_b32 s4, s5 @@ -985,6 +1027,12 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa ; ; GFX8-NOOPT-LABEL: update_dpp_p3_test: ; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[4:5], 0x28 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOOPT-NEXT: s_load_dword s1, s[4:5], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s0, s[4:5], 0x28 ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 2 @@ -1092,6 +1140,12 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa ; GFX8-NOOPT-NEXT: s_mov_b32 s91, 0xe80000 ; GFX8-NOOPT-NEXT: s_add_u32 s88, s88, s11 ; GFX8-NOOPT-NEXT: s_addc_u32 s89, s89, 0 +; GFX8-NOOPT-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[4:5], 0x28 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOOPT-NEXT: s_load_dword s1, s[4:5], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s0, s[4:5], 0x28 ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 2 @@ -1194,13 +1248,16 @@ define amdgpu_kernel void @update_dppi64_imm_old_test(ptr addrspace(1) %arg, i64 ; ; GFX8-NOOPT-LABEL: update_dppi64_imm_old_test: ; GFX8-NOOPT: ; %bb.0: -; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX8-NOOPT-NEXT: s_mov_b32 s0, 3 -; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s0, v0 -; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX8-NOOPT-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s0, 3 +; GFX8-NOOPT-NEXT: v_lshlrev_b64 v[1:2], s0, v[0:1] ; GFX8-NOOPT-NEXT: s_mov_b32 s0, s2 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s3 @@ -1320,13 +1377,16 @@ define amdgpu_kernel void @update_dppf64_imm_old_test(ptr addrspace(1) %arg, dou ; ; GFX8-NOOPT-LABEL: update_dppf64_imm_old_test: ; GFX8-NOOPT: ; %bb.0: -; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX8-NOOPT-NEXT: s_mov_b32 s0, 3 -; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s0, v0 -; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX8-NOOPT-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s0, 3 +; GFX8-NOOPT-NEXT: v_lshlrev_b64 v[1:2], s0, v[0:1] ; GFX8-NOOPT-NEXT: s_mov_b32 s0, s2 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s3 @@ -1447,6 +1507,10 @@ define amdgpu_kernel void @update_dppi64_imm_src_test(ptr addrspace(1) %out, i64 ; GFX8-NOOPT-LABEL: update_dppi64_imm_src_test: ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2c ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 @@ -1568,6 +1632,10 @@ define amdgpu_kernel void @update_dppf64_imm_src_test(ptr addrspace(1) %out, dou ; GFX8-NOOPT-LABEL: update_dppf64_imm_src_test: ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2c ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 @@ -1686,6 +1754,12 @@ define amdgpu_kernel void @dpp_test_f32(ptr addrspace(1) %out, float %in1, float ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -1779,6 +1853,12 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb1(ptr addrspace(1) %out, float % ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -1872,6 +1952,12 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb2(ptr addrspace(1) %out, float % ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -1965,6 +2051,12 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb3(ptr addrspace(1) %out, float % ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -2058,6 +2150,12 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb4(ptr addrspace(1) %out, float % ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -2151,6 +2249,12 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb5(ptr addrspace(1) %out, float % ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -2244,6 +2348,12 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb6(ptr addrspace(1) %out, float % ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -2338,6 +2448,12 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb7(ptr addrspace(1) %out, float % ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -2431,6 +2547,12 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb8(ptr addrspace(1) %out, float % ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -2524,6 +2646,12 @@ define amdgpu_kernel void @dpp_test_v2i16(ptr addrspace(1) %out, <2 x i16> %in1, ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -2617,6 +2745,12 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb1(ptr addrspace(1) %out, <2 x ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -2710,6 +2844,12 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb2(ptr addrspace(1) %out, <2 x ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -2803,6 +2943,12 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb3(ptr addrspace(1) %out, <2 x ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -2896,6 +3042,12 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb4(ptr addrspace(1) %out, <2 x ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -2989,6 +3141,12 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb5(ptr addrspace(1) %out, <2 x ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -3082,6 +3240,12 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb6(ptr addrspace(1) %out, <2 x ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -3175,6 +3339,12 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb7(ptr addrspace(1) %out, <2 x ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -3268,6 +3438,12 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb8(ptr addrspace(1) %out, <2 x ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -3361,6 +3537,12 @@ define amdgpu_kernel void @dpp_test_v2f16(ptr addrspace(1) %out, <2 x half> %in1 ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -3454,6 +3636,12 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb1(ptr addrspace(1) %out, <2 x ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -3547,6 +3735,12 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb2(ptr addrspace(1) %out, <2 x ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -3640,6 +3834,12 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb3(ptr addrspace(1) %out, <2 x ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -3733,6 +3933,12 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb4(ptr addrspace(1) %out, <2 x ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -3826,6 +4032,12 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb5(ptr addrspace(1) %out, <2 x ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -3919,6 +4131,12 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb6(ptr addrspace(1) %out, <2 x ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -4012,6 +4230,12 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb7(ptr addrspace(1) %out, <2 x ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) @@ -4105,6 +4329,12 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb8(ptr addrspace(1) %out, <2 x ; GFX8-NOOPT: ; %bb.0: ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll b/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll index 4d23fb116cd0..cf08bbbad15b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll @@ -4,9 +4,9 @@ ; GCN-LABEL: {{^}}test_debug_value: ; NOOPT: .loc 1 1 42 prologue_end ; /tmp/test_debug_value.cl:1:42 -; NOOPT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; NOOPT-NEXT: .Ltmp -; NOOPT-NEXT: ;DEBUG_VALUE: test_debug_value:globalptr_arg <- $sgpr4_sgpr5 +; NOOPT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; NOOPT: .Ltmp +; NOOPT: ;DEBUG_VALUE: test_debug_value:globalptr_arg <- $sgpr4_sgpr5 ; GCN: flat_store_dword ; GCN: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-global-invariant.ll b/llvm/test/CodeGen/AMDGPU/load-global-invariant.ll index 6cdadc5bab5f..9ac1f1794b00 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-invariant.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-invariant.ll @@ -7,6 +7,9 @@ define amdgpu_kernel void @load_constant_v3i64(ptr addrspace(1) %dst, ptr addrspace(4) %src) #0 { ; CHECK-LABEL: load_constant_v3i64: ; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; CHECK-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 @@ -32,7 +35,11 @@ define amdgpu_kernel void @load_constant_v3i64(ptr addrspace(1) %dst, ptr addrsp define amdgpu_kernel void @load_global_v3i64(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; CHECK-LABEL: load_global_v3i64: ; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; CHECK-NEXT: v_mov_b32_e32 v6, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -50,6 +57,9 @@ define amdgpu_kernel void @load_global_v3i64(ptr addrspace(1) %dst, ptr addrspac define amdgpu_kernel void @load_global_v3i64_invariant(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; CHECK-LABEL: load_global_v3i64_invariant: ; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; CHECK-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index 34fec6062a68..0dfa4e88346d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -19,15 +19,19 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -37,15 +41,19 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -55,29 +63,37 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_agent_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -85,13 +101,17 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -99,11 +119,15 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -111,23 +135,31 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_agent_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_agent_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -135,40 +167,52 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX11-WGP-LABEL: flat_agent_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_agent_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -177,12 +221,16 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX12-CU-LABEL: flat_agent_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -192,12 +240,16 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; GFX1250-LABEL: flat_agent_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -213,15 +265,19 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -231,15 +287,19 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -249,29 +309,37 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -279,13 +347,17 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -293,11 +365,15 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -305,23 +381,31 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_agent_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_agent_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -329,40 +413,52 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX11-WGP-LABEL: flat_agent_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_agent_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -371,12 +467,16 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX12-CU-LABEL: flat_agent_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -386,12 +486,16 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; GFX1250-LABEL: flat_agent_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -407,9 +511,12 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc @@ -426,9 +533,12 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc @@ -446,9 +556,12 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc @@ -462,9 +575,12 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc @@ -478,9 +594,12 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -493,22 +612,29 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_agent_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -519,22 +645,29 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX942-TGSPLIT-LABEL: flat_agent_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_agent_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc @@ -548,9 +681,12 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX11-CU-LABEL: flat_agent_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc @@ -564,14 +700,18 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX12-WGP-LABEL: flat_agent_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -579,14 +719,18 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX12-CU-LABEL: flat_agent_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -595,14 +739,18 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; GFX1250-LABEL: flat_agent_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -618,9 +766,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -638,9 +789,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -660,9 +814,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -678,9 +835,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -695,9 +855,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -711,9 +874,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -725,9 +891,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_agent_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 @@ -739,9 +908,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX942-TGSPLIT-LABEL: flat_agent_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 @@ -753,9 +925,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX11-WGP-LABEL: flat_agent_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -771,9 +946,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX11-CU-LABEL: flat_agent_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -789,9 +967,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX12-WGP-LABEL: flat_agent_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -801,6 +982,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -808,9 +990,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX12-CU-LABEL: flat_agent_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -820,6 +1005,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -828,16 +1014,20 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX1250-LABEL: flat_agent_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -854,6 +1044,10 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -869,6 +1063,10 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -884,6 +1082,10 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -895,6 +1097,10 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; SKIP-CACHE-INV-LABEL: flat_agent_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -908,6 +1114,10 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -920,6 +1130,10 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -930,6 +1144,10 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -940,6 +1158,10 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; GFX942-TGSPLIT-LABEL: flat_agent_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -950,6 +1172,10 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; GFX11-WGP-LABEL: flat_agent_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -961,6 +1187,10 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; GFX11-CU-LABEL: flat_agent_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -972,6 +1202,10 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; GFX12-WGP-LABEL: flat_agent_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -983,6 +1217,10 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; GFX12-CU-LABEL: flat_agent_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -994,12 +1232,16 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; GFX1250-LABEL: flat_agent_unordered_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1015,6 +1257,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1030,6 +1276,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -1045,6 +1295,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -1056,6 +1310,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -1069,6 +1327,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1081,6 +1343,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1091,6 +1357,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1101,6 +1371,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX942-TGSPLIT-LABEL: flat_agent_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1111,6 +1385,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX11-WGP-LABEL: flat_agent_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1122,6 +1400,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX11-CU-LABEL: flat_agent_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1133,6 +1415,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX12-WGP-LABEL: flat_agent_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1144,6 +1430,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX12-CU-LABEL: flat_agent_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1155,12 +1445,16 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX1250-LABEL: flat_agent_monotonic_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1176,6 +1470,10 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1192,6 +1490,10 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -1209,6 +1511,10 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -1222,6 +1528,10 @@ define amdgpu_kernel void @flat_agent_release_store( ; SKIP-CACHE-INV-LABEL: flat_agent_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -1236,6 +1546,10 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1249,6 +1563,10 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1260,6 +1578,10 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1272,6 +1594,10 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX942-TGSPLIT-LABEL: flat_agent_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1284,6 +1610,10 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX11-WGP-LABEL: flat_agent_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1297,6 +1627,10 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX11-CU-LABEL: flat_agent_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1310,6 +1644,10 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX12-WGP-LABEL: flat_agent_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1325,6 +1663,10 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX12-CU-LABEL: flat_agent_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1340,10 +1682,13 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX1250-LABEL: flat_agent_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1351,6 +1696,7 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1366,6 +1712,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1382,6 +1732,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -1399,6 +1753,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -1412,6 +1770,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -1426,6 +1788,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1439,6 +1805,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1450,6 +1820,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1462,6 +1836,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX942-TGSPLIT-LABEL: flat_agent_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1474,6 +1852,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX11-WGP-LABEL: flat_agent_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1487,6 +1869,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX11-CU-LABEL: flat_agent_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1500,6 +1886,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX12-WGP-LABEL: flat_agent_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1515,6 +1905,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX12-CU-LABEL: flat_agent_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1530,10 +1924,13 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX1250-LABEL: flat_agent_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1541,6 +1938,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1555,11 +1953,15 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -1570,11 +1972,15 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -1585,22 +1991,30 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1609,10 +2023,14 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1621,74 +2039,102 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_agent_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_agent_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_agent_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_agent_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -1696,7 +2142,11 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX1250-LABEL: flat_agent_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1716,11 +2166,15 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1733,11 +2187,15 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1752,11 +2210,15 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1767,11 +2229,15 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1781,10 +2247,14 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1795,10 +2265,14 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -1807,10 +2281,14 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_agent_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1819,10 +2297,14 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_agent_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -1831,11 +2313,15 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: flat_agent_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1846,11 +2332,15 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX11-CU-LABEL: flat_agent_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1861,11 +2351,15 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: flat_agent_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 @@ -1874,11 +2368,15 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX12-CU-LABEL: flat_agent_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 @@ -1888,7 +2386,11 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX1250-LABEL: flat_agent_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1911,11 +2413,15 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -1927,11 +2433,15 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1944,11 +2454,15 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1957,11 +2471,15 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_agent_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 @@ -1971,10 +2489,14 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -1984,10 +2506,14 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -1995,10 +2521,14 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_agent_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2007,10 +2537,14 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_agent_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2019,11 +2553,15 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX11-WGP-LABEL: flat_agent_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2032,11 +2570,15 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX11-CU-LABEL: flat_agent_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2045,11 +2587,15 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX12-WGP-LABEL: flat_agent_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -2060,11 +2606,15 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX12-CU-LABEL: flat_agent_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -2076,7 +2626,11 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX1250-LABEL: flat_agent_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2101,11 +2655,15 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -2119,11 +2677,15 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2140,11 +2702,15 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2157,11 +2723,15 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 @@ -2172,10 +2742,14 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -2187,10 +2761,14 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -2200,10 +2778,14 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2214,10 +2796,14 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2228,11 +2814,15 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2245,11 +2835,15 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2262,11 +2856,15 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -2279,11 +2877,15 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -2297,7 +2899,11 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX1250-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2325,11 +2931,15 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -2343,11 +2953,15 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2364,11 +2978,15 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2381,11 +2999,15 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 @@ -2396,10 +3018,14 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -2411,10 +3037,14 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -2424,10 +3054,14 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2438,10 +3072,14 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2452,11 +3090,15 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2469,11 +3111,15 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2486,11 +3132,15 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -2503,11 +3153,15 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -2521,7 +3175,11 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX1250-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2550,6 +3208,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2570,6 +3232,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2591,6 +3257,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2608,6 +3278,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2625,6 +3299,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2641,6 +3319,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2655,6 +3337,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2669,6 +3355,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2683,6 +3373,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2700,6 +3394,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2717,6 +3415,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2733,6 +3435,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2749,7 +3455,11 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX1250-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2775,6 +3485,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2796,6 +3510,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2819,6 +3537,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2838,6 +3560,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2856,6 +3582,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2873,6 +3603,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2888,6 +3622,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2904,6 +3642,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2920,6 +3662,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2939,6 +3685,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2958,6 +3708,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2978,6 +3732,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2998,7 +3756,11 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX1250-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -3029,6 +3791,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -3050,6 +3816,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -3073,6 +3843,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -3092,6 +3866,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -3110,6 +3888,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -3127,6 +3909,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -3142,6 +3928,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -3158,6 +3948,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -3174,6 +3968,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3193,6 +3991,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3212,6 +4014,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3232,6 +4038,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3252,7 +4062,11 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX1250-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -3283,11 +4097,16 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3297,6 +4116,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3312,11 +4132,16 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -3326,6 +4151,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -3341,11 +4167,16 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -3355,6 +4186,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -3366,11 +4198,16 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -3380,6 +4217,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -3393,6 +4231,12 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3409,6 +4253,12 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3423,6 +4273,12 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3437,6 +4293,12 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3451,6 +4313,12 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3466,6 +4334,12 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3481,6 +4355,12 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3496,6 +4376,12 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3511,7 +4397,13 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3537,11 +4429,16 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3551,6 +4448,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3568,11 +4466,16 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -3582,6 +4485,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -3601,11 +4505,16 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -3615,6 +4524,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -3630,11 +4540,16 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -3644,6 +4559,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -3658,6 +4574,12 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3676,6 +4598,12 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3692,6 +4620,12 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3708,6 +4642,12 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3724,6 +4664,12 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3743,6 +4689,12 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3762,6 +4714,12 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3779,6 +4737,12 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3796,7 +4760,13 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3825,11 +4795,16 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3839,6 +4814,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3855,11 +4831,16 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -3869,6 +4850,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -3886,11 +4868,16 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -3900,6 +4887,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -3913,11 +4901,16 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -3927,6 +4920,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -3941,6 +4935,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3958,6 +4958,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3973,6 +4979,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3989,6 +5001,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4005,6 +5023,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4022,6 +5046,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4039,6 +5069,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4058,6 +5094,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4077,7 +5119,13 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX1250-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4108,11 +5156,16 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4122,6 +5175,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4140,11 +5194,16 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -4154,6 +5213,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -4175,11 +5235,16 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -4189,6 +5254,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -4206,11 +5272,16 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -4220,6 +5291,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -4235,6 +5307,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4254,6 +5332,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4271,6 +5355,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4289,6 +5379,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4307,6 +5403,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4328,6 +5430,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4349,6 +5457,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4370,6 +5484,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4391,7 +5511,13 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4425,11 +5551,16 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4439,6 +5570,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4457,11 +5589,16 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -4471,6 +5608,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -4492,11 +5630,16 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -4506,6 +5649,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -4523,11 +5667,16 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -4537,6 +5686,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -4552,6 +5702,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4571,6 +5727,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4588,6 +5750,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4606,6 +5774,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4624,6 +5798,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4645,6 +5825,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4666,6 +5852,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4687,6 +5879,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4708,7 +5906,13 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4742,11 +5946,16 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4756,6 +5965,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4773,11 +5983,16 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -4787,6 +6002,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -4806,11 +6022,16 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -4820,6 +6041,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -4835,11 +6057,16 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -4849,6 +6076,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -4863,6 +6091,12 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4881,6 +6115,12 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4897,6 +6137,12 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4913,6 +6159,12 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4929,6 +6181,12 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4948,6 +6206,12 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4967,6 +6231,12 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4984,6 +6254,12 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5001,7 +6277,13 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5030,11 +6312,16 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5044,6 +6331,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5061,11 +6349,16 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -5075,6 +6368,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -5094,11 +6388,16 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -5108,6 +6407,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -5123,11 +6423,16 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -5137,6 +6442,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -5151,6 +6457,12 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5169,6 +6481,12 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5185,6 +6503,12 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5201,6 +6525,12 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5217,6 +6547,12 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5236,6 +6572,12 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5255,6 +6597,12 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5272,6 +6620,12 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5289,7 +6643,13 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX1250-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5318,11 +6678,16 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5332,6 +6697,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5350,11 +6716,16 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -5364,6 +6735,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -5385,11 +6757,16 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -5399,6 +6776,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -5416,11 +6794,16 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -5430,6 +6813,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -5445,6 +6829,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5464,6 +6854,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5481,6 +6877,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5499,6 +6901,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5517,6 +6925,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5538,6 +6952,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5559,6 +6979,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5580,6 +7006,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5601,7 +7033,13 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX1250-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5635,11 +7073,16 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5649,6 +7092,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5667,11 +7111,16 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -5681,6 +7130,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -5702,11 +7152,16 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -5716,6 +7171,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -5733,11 +7189,16 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -5747,6 +7208,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -5762,6 +7224,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5781,6 +7249,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5798,6 +7272,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5816,6 +7296,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5834,6 +7320,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5855,6 +7347,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5876,6 +7374,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5897,6 +7401,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5918,7 +7428,13 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5952,11 +7468,16 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5966,6 +7487,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5984,11 +7506,16 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -5998,6 +7525,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -6019,11 +7547,16 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -6033,6 +7566,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -6050,11 +7584,16 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -6064,6 +7603,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -6079,6 +7619,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6098,6 +7644,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6115,6 +7667,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6133,6 +7691,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6151,6 +7715,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6172,6 +7742,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6193,6 +7769,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6214,6 +7796,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6235,7 +7823,13 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6269,11 +7863,16 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6283,6 +7882,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6301,11 +7901,16 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -6315,6 +7920,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -6336,11 +7942,16 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -6350,6 +7961,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -6367,11 +7979,16 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -6381,6 +7998,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -6396,6 +8014,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6415,6 +8039,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6432,6 +8062,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6450,6 +8086,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6468,6 +8110,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6489,6 +8137,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6510,6 +8164,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6531,6 +8191,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6552,7 +8218,13 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6586,11 +8258,16 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6600,6 +8277,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6618,11 +8296,16 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -6632,6 +8315,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -6653,11 +8337,16 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -6667,6 +8356,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -6684,11 +8374,16 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -6698,6 +8393,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -6713,6 +8409,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6732,6 +8434,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6749,6 +8457,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6767,6 +8481,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6785,6 +8505,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6806,6 +8532,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6827,6 +8559,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6848,6 +8586,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6869,7 +8613,13 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6903,11 +8653,16 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6917,6 +8672,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6935,11 +8691,16 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -6949,6 +8710,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -6970,11 +8732,16 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -6984,6 +8751,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -7001,11 +8769,16 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -7015,6 +8788,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -7030,6 +8804,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7049,6 +8829,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7066,6 +8852,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7084,6 +8876,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7102,6 +8900,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7123,6 +8927,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7144,6 +8954,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7165,6 +8981,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7186,7 +9008,13 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7220,11 +9048,16 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -7234,6 +9067,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -7252,11 +9086,16 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -7266,6 +9105,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -7287,11 +9127,16 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -7301,6 +9146,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -7318,11 +9164,16 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -7332,6 +9183,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -7347,6 +9199,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7366,6 +9224,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7383,6 +9247,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7401,6 +9271,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7419,6 +9295,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7440,6 +9322,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7461,6 +9349,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7482,6 +9376,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7503,7 +9403,13 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7537,11 +9443,16 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -7551,6 +9462,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -7569,11 +9481,16 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -7583,6 +9500,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -7604,11 +9522,16 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -7618,6 +9541,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -7635,11 +9559,16 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -7649,6 +9578,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -7664,6 +9594,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7683,6 +9619,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7700,6 +9642,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7718,6 +9666,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7736,6 +9690,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7757,6 +9717,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7778,6 +9744,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7799,6 +9771,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7820,7 +9798,13 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7855,6 +9839,12 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7888,6 +9878,12 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -7921,6 +9917,12 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -7950,6 +9952,12 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -7980,6 +9988,12 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7999,6 +10013,12 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8016,6 +10036,12 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8033,6 +10059,12 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8050,6 +10082,12 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8069,6 +10107,12 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8088,6 +10132,12 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8107,6 +10157,12 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8126,7 +10182,13 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8157,6 +10219,12 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8191,6 +10259,12 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -8226,6 +10300,12 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -8257,6 +10337,12 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -8287,6 +10373,12 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8307,6 +10399,12 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8325,6 +10423,12 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8343,6 +10447,12 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8361,6 +10471,12 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8382,6 +10498,12 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8403,6 +10525,12 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8423,6 +10551,12 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8443,7 +10577,13 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8476,6 +10616,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8510,6 +10656,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -8545,6 +10697,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -8576,6 +10734,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -8607,6 +10771,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8627,6 +10797,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8645,6 +10821,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8664,6 +10846,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8683,6 +10871,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8704,6 +10898,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8725,6 +10925,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8748,6 +10954,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8771,7 +10983,13 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8807,6 +11025,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8842,6 +11066,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -8879,6 +11109,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -8912,6 +11148,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -8943,6 +11185,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8964,6 +11212,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8983,6 +11237,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9003,6 +11263,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9023,6 +11289,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9046,6 +11318,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9069,6 +11347,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9093,6 +11377,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9117,7 +11407,13 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9155,6 +11451,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9190,6 +11492,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -9227,6 +11535,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -9260,6 +11574,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -9291,6 +11611,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9312,6 +11638,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9331,6 +11663,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9351,6 +11689,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9371,6 +11715,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9394,6 +11744,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9417,6 +11773,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9441,6 +11803,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9465,7 +11833,13 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9503,6 +11877,12 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9537,6 +11917,12 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -9572,6 +11958,12 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -9603,6 +11995,12 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -9633,6 +12031,12 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9653,6 +12057,12 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9671,6 +12081,12 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9689,6 +12105,12 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9707,6 +12129,12 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9728,6 +12156,12 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9749,6 +12183,12 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9769,6 +12209,12 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9789,7 +12235,13 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9822,6 +12274,12 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9856,6 +12314,12 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -9891,6 +12355,12 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -9922,6 +12392,12 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -9952,6 +12428,12 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9972,6 +12454,12 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9990,6 +12478,12 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10008,6 +12502,12 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10026,6 +12526,12 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10047,6 +12553,12 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10068,6 +12580,12 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10088,6 +12606,12 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10108,7 +12632,13 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10141,6 +12671,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10176,6 +12712,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -10213,6 +12755,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -10246,6 +12794,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -10277,6 +12831,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10298,6 +12858,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10317,6 +12883,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10337,6 +12909,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10357,6 +12935,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10380,6 +12964,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10403,6 +12993,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10427,6 +13023,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10451,7 +13053,13 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10489,6 +13097,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10524,6 +13138,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -10561,6 +13181,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -10594,6 +13220,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -10625,6 +13257,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10646,6 +13284,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10665,6 +13309,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10685,6 +13335,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10705,6 +13361,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10728,6 +13390,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10751,6 +13419,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10775,6 +13449,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10799,7 +13479,13 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10837,6 +13523,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10872,6 +13564,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -10909,6 +13607,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -10942,6 +13646,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -10973,6 +13683,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10994,6 +13710,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11013,6 +13735,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11033,6 +13761,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11053,6 +13787,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11076,6 +13816,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11099,6 +13845,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11123,6 +13875,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11147,7 +13905,13 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -11185,6 +13949,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -11220,6 +13990,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -11257,6 +14033,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -11290,6 +14072,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -11321,6 +14109,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11342,6 +14136,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11361,6 +14161,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11381,6 +14187,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11401,6 +14213,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11424,6 +14242,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11447,6 +14271,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11471,6 +14301,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11495,7 +14331,13 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -11533,6 +14375,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -11568,6 +14416,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -11605,6 +14459,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -11638,6 +14498,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -11669,6 +14535,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11690,6 +14562,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11709,6 +14587,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11729,6 +14613,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11749,6 +14639,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11772,6 +14668,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11795,6 +14697,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11819,6 +14727,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11843,7 +14757,13 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -11881,6 +14801,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -11916,6 +14842,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -11953,6 +14885,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -11986,6 +14924,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -12017,6 +14961,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12038,6 +14988,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12057,6 +15013,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12077,6 +15039,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12097,6 +15065,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12120,6 +15094,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12143,6 +15123,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12167,6 +15153,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12191,7 +15183,13 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -12229,6 +15227,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -12264,6 +15268,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -12301,6 +15311,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -12334,6 +15350,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -12365,6 +15387,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12386,6 +15414,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12405,6 +15439,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12425,6 +15465,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12445,6 +15491,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12468,6 +15520,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12491,6 +15549,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12515,6 +15579,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12539,7 +15609,13 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -12577,6 +15653,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -12612,6 +15694,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -12649,6 +15737,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -12682,6 +15776,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -12713,6 +15813,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12734,6 +15840,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12753,6 +15865,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12773,6 +15891,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12793,6 +15917,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12816,6 +15946,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12839,6 +15975,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12863,6 +16005,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12887,7 +16035,13 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -12923,15 +16077,19 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12941,15 +16099,19 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -12959,29 +16121,37 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12989,13 +16159,17 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13003,11 +16177,15 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -13015,23 +16193,31 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -13039,40 +16225,52 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX11-WGP-LABEL: flat_agent_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_agent_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -13081,12 +16279,16 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX12-CU-LABEL: flat_agent_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -13096,12 +16298,16 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; GFX1250-LABEL: flat_agent_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -13117,15 +16323,19 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -13135,15 +16345,19 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -13153,29 +16367,37 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13183,13 +16405,17 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13197,11 +16423,15 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -13209,23 +16439,31 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -13233,40 +16471,52 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -13275,12 +16525,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -13290,12 +16544,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; GFX1250-LABEL: flat_agent_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -13311,17 +16569,20 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -13331,18 +16592,21 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -13352,33 +16616,38 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13386,15 +16655,18 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13402,88 +16674,109 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -13492,14 +16785,18 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX12-CU-LABEL: flat_agent_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -13509,14 +16806,18 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; GFX1250-LABEL: flat_agent_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -13532,18 +16833,21 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -13553,9 +16857,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -13564,9 +16871,9 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -13576,9 +16883,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -13587,25 +16897,27 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13613,16 +16925,19 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13630,52 +16945,66 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -13684,17 +17013,20 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -13703,17 +17035,20 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -13723,6 +17058,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -13731,9 +17067,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -13743,6 +17082,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -13752,16 +17092,20 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX1250-LABEL: flat_agent_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -13778,6 +17122,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -13793,6 +17141,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -13808,6 +17160,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -13819,6 +17175,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -13832,6 +17192,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -13844,6 +17208,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -13854,6 +17222,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -13864,6 +17236,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -13874,6 +17250,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX11-WGP-LABEL: flat_agent_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -13885,6 +17265,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX11-CU-LABEL: flat_agent_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -13896,6 +17280,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX12-WGP-LABEL: flat_agent_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -13907,6 +17295,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX12-CU-LABEL: flat_agent_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -13918,12 +17310,16 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX1250-LABEL: flat_agent_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -13939,6 +17335,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -13954,6 +17354,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -13969,6 +17373,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -13980,6 +17388,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -13993,6 +17405,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -14005,6 +17421,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -14015,6 +17435,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -14025,6 +17449,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -14035,6 +17463,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -14046,6 +17478,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -14057,6 +17493,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -14068,6 +17508,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -14079,12 +17523,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX1250-LABEL: flat_agent_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -14100,6 +17548,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -14116,6 +17568,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -14133,6 +17589,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -14146,6 +17606,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -14160,6 +17624,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -14173,6 +17641,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -14184,6 +17656,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -14196,6 +17672,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -14208,6 +17688,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX11-WGP-LABEL: flat_agent_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -14221,6 +17705,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX11-CU-LABEL: flat_agent_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -14234,6 +17722,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX12-WGP-LABEL: flat_agent_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -14249,6 +17741,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX12-CU-LABEL: flat_agent_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -14264,10 +17760,13 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX1250-LABEL: flat_agent_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -14275,6 +17774,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -14290,6 +17790,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -14306,6 +17810,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -14323,6 +17831,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -14336,6 +17848,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -14350,6 +17866,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -14363,6 +17883,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -14374,6 +17898,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -14386,6 +17914,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -14398,6 +17930,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -14411,6 +17947,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -14424,6 +17964,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -14439,6 +17983,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -14454,10 +18002,13 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX1250-LABEL: flat_agent_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -14465,6 +18016,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -14479,11 +18031,15 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -14494,11 +18050,15 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -14509,22 +18069,30 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -14533,10 +18101,14 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -14545,74 +18117,102 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -14620,7 +18220,11 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX1250-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14640,11 +18244,15 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -14657,11 +18265,15 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -14675,11 +18287,15 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -14689,11 +18305,15 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -14703,10 +18323,14 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -14717,10 +18341,14 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -14729,10 +18357,14 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -14741,10 +18373,14 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -14753,11 +18389,15 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -14767,11 +18407,15 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX11-CU-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -14781,11 +18425,15 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -14794,11 +18442,15 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX12-CU-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 @@ -14808,7 +18460,11 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX1250-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14831,11 +18487,15 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -14847,11 +18507,15 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -14864,11 +18528,15 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -14877,11 +18545,15 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 @@ -14891,10 +18563,14 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -14904,10 +18580,14 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -14915,10 +18595,14 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -14927,10 +18611,14 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -14939,11 +18627,15 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX11-WGP-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -14952,11 +18644,15 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX11-CU-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -14965,11 +18661,15 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX12-WGP-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -14980,11 +18680,15 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX12-CU-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -14996,7 +18700,11 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX1250-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15021,11 +18729,15 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -15039,11 +18751,15 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15059,11 +18775,15 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15075,11 +18795,15 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 @@ -15090,10 +18814,14 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -15105,10 +18833,14 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -15118,10 +18850,14 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -15132,10 +18868,14 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -15146,11 +18886,15 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15162,11 +18906,15 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15178,11 +18926,15 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -15195,11 +18947,15 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -15213,7 +18969,11 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX1250-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15241,11 +19001,15 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -15259,11 +19023,15 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15279,11 +19047,15 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15295,11 +19067,15 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 @@ -15310,10 +19086,14 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -15325,10 +19105,14 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -15338,10 +19122,14 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -15352,10 +19140,14 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -15366,11 +19158,15 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15382,11 +19178,15 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15398,11 +19198,15 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -15415,11 +19219,15 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -15433,7 +19241,11 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX1250-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15462,6 +19274,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15483,6 +19299,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -15505,6 +19325,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -15523,6 +19347,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15541,6 +19369,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -15558,6 +19390,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -15572,6 +19408,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -15587,6 +19427,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -15601,6 +19445,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -15619,6 +19467,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -15637,6 +19489,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -15654,6 +19510,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -15671,7 +19531,11 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX1250-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15697,6 +19561,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15719,6 +19587,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -15743,6 +19615,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -15763,6 +19639,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15782,6 +19662,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -15800,6 +19684,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -15815,6 +19703,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -15832,6 +19724,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -15848,6 +19744,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -15868,6 +19768,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -15888,6 +19792,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -15909,6 +19817,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -15930,7 +19842,11 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX1250-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15961,6 +19877,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15983,6 +19903,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -16007,6 +19931,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -16027,6 +19955,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16046,6 +19978,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -16064,6 +20000,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -16079,6 +20019,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -16096,6 +20040,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -16112,6 +20060,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -16132,6 +20084,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -16152,6 +20108,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -16173,6 +20133,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -16194,7 +20158,11 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX1250-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16225,11 +20193,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16239,6 +20212,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16254,11 +20228,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -16268,6 +20247,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -16283,11 +20263,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -16297,6 +20282,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -16308,11 +20294,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -16322,6 +20313,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -16335,6 +20327,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16351,6 +20349,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16365,6 +20369,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16379,6 +20389,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16393,6 +20409,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16408,6 +20430,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16423,6 +20451,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16438,6 +20472,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16453,7 +20493,13 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16479,11 +20525,16 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16493,6 +20544,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16510,11 +20562,16 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -16524,6 +20581,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -16542,11 +20600,16 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -16556,6 +20619,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -16570,11 +20634,16 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -16584,6 +20653,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -16598,6 +20668,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16616,6 +20692,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16632,6 +20714,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16648,6 +20736,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16664,6 +20758,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16682,6 +20782,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16700,6 +20806,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16717,6 +20829,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16734,7 +20852,13 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16763,11 +20887,16 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16777,6 +20906,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16793,11 +20923,16 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -16807,6 +20942,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -16824,11 +20960,16 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -16838,6 +20979,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -16851,11 +20993,16 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -16865,6 +21012,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -16879,6 +21027,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16896,6 +21050,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16911,6 +21071,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16927,6 +21093,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16943,6 +21115,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16960,6 +21138,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16977,6 +21161,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16996,6 +21186,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17015,7 +21211,13 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17046,11 +21248,16 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17060,6 +21267,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17078,11 +21286,16 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -17092,6 +21305,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -17112,11 +21326,16 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -17126,6 +21345,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -17142,11 +21362,16 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -17156,6 +21381,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -17171,6 +21397,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17190,6 +21422,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17207,6 +21445,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17225,6 +21469,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17243,6 +21493,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17263,6 +21519,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17283,6 +21545,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17304,6 +21572,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17325,7 +21599,13 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17359,11 +21639,16 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17373,6 +21658,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17391,11 +21677,16 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -17405,6 +21696,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -17425,11 +21717,16 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -17439,6 +21736,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -17455,11 +21753,16 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -17469,6 +21772,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -17484,6 +21788,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17503,6 +21813,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17520,6 +21836,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17538,6 +21860,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17556,6 +21884,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17576,6 +21910,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17596,6 +21936,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17617,6 +21963,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17638,7 +21990,13 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17672,11 +22030,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17686,6 +22049,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17703,11 +22067,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -17717,6 +22086,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -17735,11 +22105,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -17749,6 +22124,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -17763,11 +22139,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -17777,6 +22158,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -17791,6 +22173,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17809,6 +22197,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17825,6 +22219,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17841,6 +22241,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17857,6 +22263,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17875,6 +22287,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17893,6 +22311,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17910,6 +22334,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17927,7 +22357,13 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17956,11 +22392,16 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17970,6 +22411,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17987,11 +22429,16 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -18001,6 +22448,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -18019,11 +22467,16 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -18033,6 +22486,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -18047,11 +22501,16 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -18061,6 +22520,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -18075,6 +22535,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18093,6 +22559,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18109,6 +22581,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18125,6 +22603,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18141,6 +22625,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18159,6 +22649,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18177,6 +22673,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18194,6 +22696,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18211,7 +22719,13 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18240,11 +22754,16 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -18254,6 +22773,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -18272,11 +22792,16 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -18286,6 +22811,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -18306,11 +22832,16 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -18320,6 +22851,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -18336,11 +22868,16 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -18350,6 +22887,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -18365,6 +22903,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18384,6 +22928,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18401,6 +22951,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18419,6 +22975,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18437,6 +22999,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18457,6 +23025,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18477,6 +23051,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18498,6 +23078,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18519,7 +23105,13 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18553,11 +23145,16 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -18567,6 +23164,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -18585,11 +23183,16 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -18599,6 +23202,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -18619,11 +23223,16 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -18633,6 +23242,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -18649,11 +23259,16 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -18663,6 +23278,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -18678,6 +23294,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18697,6 +23319,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18714,6 +23342,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18732,6 +23366,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18750,6 +23390,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18770,6 +23416,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18790,6 +23442,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18811,6 +23469,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18832,7 +23496,13 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18866,11 +23536,16 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -18880,6 +23555,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -18898,11 +23574,16 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -18912,6 +23593,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -18932,11 +23614,16 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -18946,6 +23633,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -18962,11 +23650,16 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -18976,6 +23669,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -18991,6 +23685,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19010,6 +23710,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19027,6 +23733,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19045,6 +23757,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19063,6 +23781,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19083,6 +23807,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19103,6 +23833,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19124,6 +23860,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19145,7 +23887,13 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19179,11 +23927,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -19193,6 +23946,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -19211,11 +23965,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -19225,6 +23984,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -19245,11 +24005,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -19259,6 +24024,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -19275,11 +24041,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -19289,6 +24060,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -19304,6 +24076,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19323,6 +24101,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19340,6 +24124,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19358,6 +24148,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19376,6 +24172,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19396,6 +24198,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19416,6 +24224,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19437,6 +24251,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19458,7 +24278,13 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19492,11 +24318,16 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -19506,6 +24337,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -19524,11 +24356,16 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -19538,6 +24375,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -19558,11 +24396,16 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -19572,6 +24415,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -19588,11 +24432,16 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -19602,6 +24451,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -19617,6 +24467,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19636,6 +24492,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19653,6 +24515,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19671,6 +24539,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19689,6 +24563,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19709,6 +24589,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19729,6 +24615,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19750,6 +24642,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19771,7 +24669,13 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19805,11 +24709,16 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -19819,6 +24728,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -19837,11 +24747,16 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -19851,6 +24766,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -19871,11 +24787,16 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -19885,6 +24806,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -19901,11 +24823,16 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -19915,6 +24842,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -19930,6 +24858,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19949,6 +24883,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19966,6 +24906,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19984,6 +24930,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20002,6 +24954,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20022,6 +24980,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20042,6 +25006,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20063,6 +25033,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20084,7 +25060,13 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20118,11 +25100,16 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -20132,6 +25119,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -20150,11 +25138,16 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -20164,6 +25157,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -20184,11 +25178,16 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -20198,6 +25197,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -20214,11 +25214,16 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -20228,6 +25233,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -20243,6 +25249,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20262,6 +25274,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20279,6 +25297,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20297,6 +25321,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20315,6 +25345,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20335,6 +25371,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20355,6 +25397,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20376,6 +25424,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20397,7 +25451,13 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20431,11 +25491,16 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -20445,6 +25510,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -20463,11 +25529,16 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -20477,6 +25548,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -20497,11 +25569,16 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -20511,6 +25588,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -20527,11 +25605,16 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -20541,6 +25624,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -20556,6 +25640,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20575,6 +25665,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20592,6 +25688,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20610,6 +25712,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20628,6 +25736,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20648,6 +25762,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20668,6 +25788,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20689,6 +25815,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20710,7 +25842,13 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20745,6 +25883,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20778,6 +25922,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -20811,6 +25961,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -20840,6 +25996,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -20870,6 +26032,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20889,6 +26057,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20906,6 +26080,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20923,6 +26103,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20940,6 +26126,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20959,6 +26151,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20978,6 +26176,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20997,6 +26201,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21016,7 +26226,13 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21047,6 +26263,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21082,6 +26304,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -21118,6 +26346,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -21150,6 +26384,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -21181,6 +26421,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21202,6 +26448,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21220,6 +26472,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21239,6 +26497,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21257,6 +26521,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21279,6 +26549,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21301,6 +26577,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21322,6 +26604,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21343,7 +26631,13 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21376,6 +26670,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21410,6 +26710,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -21445,6 +26751,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -21476,6 +26788,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -21507,6 +26825,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21527,6 +26851,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21545,6 +26875,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21564,6 +26900,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21583,6 +26925,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21604,6 +26952,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21625,6 +26979,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21648,6 +27008,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21671,7 +27037,13 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21707,6 +27079,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21743,6 +27121,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -21781,6 +27165,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -21815,6 +27205,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -21847,6 +27243,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21869,6 +27271,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21888,6 +27296,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21909,6 +27323,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21929,6 +27349,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21953,6 +27379,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21977,6 +27409,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22002,6 +27440,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22027,7 +27471,13 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -22065,6 +27515,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -22101,6 +27557,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -22139,6 +27601,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -22173,6 +27641,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -22205,6 +27679,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22227,6 +27707,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22246,6 +27732,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22267,6 +27759,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22287,6 +27785,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22311,6 +27815,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22335,6 +27845,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22360,6 +27876,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22385,7 +27907,13 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -22423,6 +27951,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -22458,6 +27992,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -22494,6 +28034,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -22526,6 +28072,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -22557,6 +28109,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22578,6 +28136,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22596,6 +28160,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22615,6 +28185,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22633,6 +28209,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22655,6 +28237,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22677,6 +28265,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22698,6 +28292,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22719,7 +28319,13 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -22752,6 +28358,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -22787,6 +28399,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -22823,6 +28441,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -22855,6 +28479,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -22886,6 +28516,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22907,6 +28543,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22925,6 +28567,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22944,6 +28592,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22962,6 +28616,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22984,6 +28644,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -23006,6 +28672,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -23027,6 +28699,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -23048,7 +28726,13 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -23081,6 +28765,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -23117,6 +28807,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -23155,6 +28851,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -23189,6 +28891,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -23221,6 +28929,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23243,6 +28957,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23262,6 +28982,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23283,6 +29009,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23303,6 +29035,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -23327,6 +29065,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -23351,6 +29095,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -23376,6 +29126,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -23401,7 +29157,13 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -23439,6 +29201,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -23475,6 +29243,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -23513,6 +29287,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -23547,6 +29327,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -23579,6 +29365,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23601,6 +29393,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23620,6 +29418,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23641,6 +29445,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23661,6 +29471,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -23685,6 +29501,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -23709,6 +29531,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -23734,6 +29562,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -23759,7 +29593,13 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -23797,6 +29637,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -23833,6 +29679,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -23871,6 +29723,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -23905,6 +29763,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -23937,6 +29801,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23959,6 +29829,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23978,6 +29854,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23999,6 +29881,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24019,6 +29907,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -24043,6 +29937,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -24067,6 +29967,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -24092,6 +29998,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -24117,7 +30029,13 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -24155,6 +30073,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -24191,6 +30115,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -24229,6 +30159,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -24263,6 +30199,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -24295,6 +30237,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24317,6 +30265,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24336,6 +30290,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24357,6 +30317,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24377,6 +30343,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -24401,6 +30373,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -24425,6 +30403,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -24450,6 +30434,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -24475,7 +30465,13 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -24513,6 +30509,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -24549,6 +30551,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -24587,6 +30595,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -24621,6 +30635,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -24653,6 +30673,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24675,6 +30701,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24694,6 +30726,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24715,6 +30753,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24735,6 +30779,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -24759,6 +30809,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -24783,6 +30839,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -24808,6 +30870,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -24833,7 +30901,13 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -24871,6 +30945,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -24907,6 +30987,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -24945,6 +31031,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -24979,6 +31071,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -25011,6 +31109,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25033,6 +31137,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25052,6 +31162,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25073,6 +31189,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25093,6 +31215,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -25117,6 +31245,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -25141,6 +31275,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -25166,6 +31306,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -25191,7 +31337,13 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -25229,6 +31381,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -25265,6 +31423,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -25303,6 +31467,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -25337,6 +31507,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -25369,6 +31545,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25391,6 +31573,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25410,6 +31598,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25431,6 +31625,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25451,6 +31651,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -25475,6 +31681,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -25499,6 +31711,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -25524,6 +31742,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -25549,7 +31773,13 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -25587,6 +31817,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -25623,6 +31859,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -25661,6 +31903,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -25695,6 +31943,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -25727,6 +31981,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25749,6 +32009,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25768,6 +32034,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25789,6 +32061,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25809,6 +32087,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -25833,6 +32117,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -25857,6 +32147,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -25882,6 +32178,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -25907,7 +32209,13 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll index 2848dadc7b5d..26f616110122 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll @@ -19,15 +19,19 @@ define amdgpu_kernel void @flat_cluster_unordered_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -37,15 +41,19 @@ define amdgpu_kernel void @flat_cluster_unordered_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -55,29 +63,37 @@ define amdgpu_kernel void @flat_cluster_unordered_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_cluster_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -85,13 +101,17 @@ define amdgpu_kernel void @flat_cluster_unordered_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -99,11 +119,15 @@ define amdgpu_kernel void @flat_cluster_unordered_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -111,23 +135,31 @@ define amdgpu_kernel void @flat_cluster_unordered_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_cluster_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -135,40 +167,52 @@ define amdgpu_kernel void @flat_cluster_unordered_load( ; ; GFX11-WGP-LABEL: flat_cluster_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_cluster_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_cluster_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -177,12 +221,16 @@ define amdgpu_kernel void @flat_cluster_unordered_load( ; ; GFX12-CU-LABEL: flat_cluster_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -192,12 +240,16 @@ define amdgpu_kernel void @flat_cluster_unordered_load( ; GFX1250-LABEL: flat_cluster_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -213,15 +265,19 @@ define amdgpu_kernel void @flat_cluster_monotonic_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -231,15 +287,19 @@ define amdgpu_kernel void @flat_cluster_monotonic_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -249,29 +309,37 @@ define amdgpu_kernel void @flat_cluster_monotonic_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_cluster_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -279,13 +347,17 @@ define amdgpu_kernel void @flat_cluster_monotonic_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -293,11 +365,15 @@ define amdgpu_kernel void @flat_cluster_monotonic_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -305,23 +381,31 @@ define amdgpu_kernel void @flat_cluster_monotonic_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -329,40 +413,52 @@ define amdgpu_kernel void @flat_cluster_monotonic_load( ; ; GFX11-WGP-LABEL: flat_cluster_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_cluster_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_cluster_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -371,12 +467,16 @@ define amdgpu_kernel void @flat_cluster_monotonic_load( ; ; GFX12-CU-LABEL: flat_cluster_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -386,12 +486,16 @@ define amdgpu_kernel void @flat_cluster_monotonic_load( ; GFX1250-LABEL: flat_cluster_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -407,9 +511,12 @@ define amdgpu_kernel void @flat_cluster_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc @@ -426,9 +533,12 @@ define amdgpu_kernel void @flat_cluster_acquire_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc @@ -446,9 +556,12 @@ define amdgpu_kernel void @flat_cluster_acquire_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc @@ -462,9 +575,12 @@ define amdgpu_kernel void @flat_cluster_acquire_load( ; ; SKIP-CACHE-INV-LABEL: flat_cluster_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc @@ -478,9 +594,12 @@ define amdgpu_kernel void @flat_cluster_acquire_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -493,22 +612,29 @@ define amdgpu_kernel void @flat_cluster_acquire_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -519,22 +645,29 @@ define amdgpu_kernel void @flat_cluster_acquire_load( ; ; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_cluster_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc @@ -548,9 +681,12 @@ define amdgpu_kernel void @flat_cluster_acquire_load( ; ; GFX11-CU-LABEL: flat_cluster_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc @@ -564,14 +700,18 @@ define amdgpu_kernel void @flat_cluster_acquire_load( ; ; GFX12-WGP-LABEL: flat_cluster_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -579,14 +719,18 @@ define amdgpu_kernel void @flat_cluster_acquire_load( ; ; GFX12-CU-LABEL: flat_cluster_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -595,13 +739,17 @@ define amdgpu_kernel void @flat_cluster_acquire_load( ; GFX1250-LABEL: flat_cluster_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -617,9 +765,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -637,9 +788,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -659,9 +813,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -677,9 +834,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -694,9 +854,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -710,9 +873,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -724,9 +890,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 @@ -738,9 +907,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_load( ; ; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 @@ -752,9 +924,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_load( ; ; GFX11-WGP-LABEL: flat_cluster_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -770,9 +945,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_load( ; ; GFX11-CU-LABEL: flat_cluster_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -788,9 +966,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_load( ; ; GFX12-WGP-LABEL: flat_cluster_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -800,6 +981,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_load( ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -807,9 +989,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_load( ; ; GFX12-CU-LABEL: flat_cluster_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -819,6 +1004,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_load( ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -827,15 +1013,19 @@ define amdgpu_kernel void @flat_cluster_seq_cst_load( ; GFX1250-LABEL: flat_cluster_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -852,6 +1042,10 @@ define amdgpu_kernel void @flat_cluster_unordered_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -867,6 +1061,10 @@ define amdgpu_kernel void @flat_cluster_unordered_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -882,6 +1080,10 @@ define amdgpu_kernel void @flat_cluster_unordered_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -893,6 +1095,10 @@ define amdgpu_kernel void @flat_cluster_unordered_store( ; SKIP-CACHE-INV-LABEL: flat_cluster_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -906,6 +1112,10 @@ define amdgpu_kernel void @flat_cluster_unordered_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -918,6 +1128,10 @@ define amdgpu_kernel void @flat_cluster_unordered_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -928,6 +1142,10 @@ define amdgpu_kernel void @flat_cluster_unordered_store( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -938,6 +1156,10 @@ define amdgpu_kernel void @flat_cluster_unordered_store( ; GFX942-TGSPLIT-LABEL: flat_cluster_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -948,6 +1170,10 @@ define amdgpu_kernel void @flat_cluster_unordered_store( ; GFX11-WGP-LABEL: flat_cluster_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -959,6 +1185,10 @@ define amdgpu_kernel void @flat_cluster_unordered_store( ; GFX11-CU-LABEL: flat_cluster_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -970,6 +1200,10 @@ define amdgpu_kernel void @flat_cluster_unordered_store( ; GFX12-WGP-LABEL: flat_cluster_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -981,6 +1215,10 @@ define amdgpu_kernel void @flat_cluster_unordered_store( ; GFX12-CU-LABEL: flat_cluster_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -992,12 +1230,16 @@ define amdgpu_kernel void @flat_cluster_unordered_store( ; GFX1250-LABEL: flat_cluster_unordered_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1013,6 +1255,10 @@ define amdgpu_kernel void @flat_cluster_monotonic_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1028,6 +1274,10 @@ define amdgpu_kernel void @flat_cluster_monotonic_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -1043,6 +1293,10 @@ define amdgpu_kernel void @flat_cluster_monotonic_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -1054,6 +1308,10 @@ define amdgpu_kernel void @flat_cluster_monotonic_store( ; SKIP-CACHE-INV-LABEL: flat_cluster_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -1067,6 +1325,10 @@ define amdgpu_kernel void @flat_cluster_monotonic_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1079,6 +1341,10 @@ define amdgpu_kernel void @flat_cluster_monotonic_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1089,6 +1355,10 @@ define amdgpu_kernel void @flat_cluster_monotonic_store( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1099,6 +1369,10 @@ define amdgpu_kernel void @flat_cluster_monotonic_store( ; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1109,6 +1383,10 @@ define amdgpu_kernel void @flat_cluster_monotonic_store( ; GFX11-WGP-LABEL: flat_cluster_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1120,6 +1398,10 @@ define amdgpu_kernel void @flat_cluster_monotonic_store( ; GFX11-CU-LABEL: flat_cluster_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1131,6 +1413,10 @@ define amdgpu_kernel void @flat_cluster_monotonic_store( ; GFX12-WGP-LABEL: flat_cluster_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1142,6 +1428,10 @@ define amdgpu_kernel void @flat_cluster_monotonic_store( ; GFX12-CU-LABEL: flat_cluster_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1153,12 +1443,16 @@ define amdgpu_kernel void @flat_cluster_monotonic_store( ; GFX1250-LABEL: flat_cluster_monotonic_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1174,6 +1468,10 @@ define amdgpu_kernel void @flat_cluster_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1190,6 +1488,10 @@ define amdgpu_kernel void @flat_cluster_release_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -1207,6 +1509,10 @@ define amdgpu_kernel void @flat_cluster_release_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -1220,6 +1526,10 @@ define amdgpu_kernel void @flat_cluster_release_store( ; SKIP-CACHE-INV-LABEL: flat_cluster_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -1234,6 +1544,10 @@ define amdgpu_kernel void @flat_cluster_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1247,6 +1561,10 @@ define amdgpu_kernel void @flat_cluster_release_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1258,6 +1576,10 @@ define amdgpu_kernel void @flat_cluster_release_store( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1270,6 +1592,10 @@ define amdgpu_kernel void @flat_cluster_release_store( ; GFX942-TGSPLIT-LABEL: flat_cluster_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1282,6 +1608,10 @@ define amdgpu_kernel void @flat_cluster_release_store( ; GFX11-WGP-LABEL: flat_cluster_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1295,6 +1625,10 @@ define amdgpu_kernel void @flat_cluster_release_store( ; GFX11-CU-LABEL: flat_cluster_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1308,6 +1642,10 @@ define amdgpu_kernel void @flat_cluster_release_store( ; GFX12-WGP-LABEL: flat_cluster_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1323,6 +1661,10 @@ define amdgpu_kernel void @flat_cluster_release_store( ; GFX12-CU-LABEL: flat_cluster_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1338,14 +1680,18 @@ define amdgpu_kernel void @flat_cluster_release_store( ; GFX1250-LABEL: flat_cluster_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1361,6 +1707,10 @@ define amdgpu_kernel void @flat_cluster_seq_cst_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1377,6 +1727,10 @@ define amdgpu_kernel void @flat_cluster_seq_cst_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -1394,6 +1748,10 @@ define amdgpu_kernel void @flat_cluster_seq_cst_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -1407,6 +1765,10 @@ define amdgpu_kernel void @flat_cluster_seq_cst_store( ; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -1421,6 +1783,10 @@ define amdgpu_kernel void @flat_cluster_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1434,6 +1800,10 @@ define amdgpu_kernel void @flat_cluster_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1445,6 +1815,10 @@ define amdgpu_kernel void @flat_cluster_seq_cst_store( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1457,6 +1831,10 @@ define amdgpu_kernel void @flat_cluster_seq_cst_store( ; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1469,6 +1847,10 @@ define amdgpu_kernel void @flat_cluster_seq_cst_store( ; GFX11-WGP-LABEL: flat_cluster_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1482,6 +1864,10 @@ define amdgpu_kernel void @flat_cluster_seq_cst_store( ; GFX11-CU-LABEL: flat_cluster_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1495,6 +1881,10 @@ define amdgpu_kernel void @flat_cluster_seq_cst_store( ; GFX12-WGP-LABEL: flat_cluster_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1510,6 +1900,10 @@ define amdgpu_kernel void @flat_cluster_seq_cst_store( ; GFX12-CU-LABEL: flat_cluster_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1525,14 +1919,18 @@ define amdgpu_kernel void @flat_cluster_seq_cst_store( ; GFX1250-LABEL: flat_cluster_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1547,11 +1945,15 @@ define amdgpu_kernel void @flat_cluster_monotonic_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -1562,11 +1964,15 @@ define amdgpu_kernel void @flat_cluster_monotonic_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -1577,22 +1983,30 @@ define amdgpu_kernel void @flat_cluster_monotonic_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_cluster_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1601,10 +2015,14 @@ define amdgpu_kernel void @flat_cluster_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1613,74 +2031,102 @@ define amdgpu_kernel void @flat_cluster_monotonic_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_cluster_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_cluster_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_cluster_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_cluster_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -1688,7 +2134,11 @@ define amdgpu_kernel void @flat_cluster_monotonic_atomicrmw( ; GFX1250-LABEL: flat_cluster_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1708,11 +2158,15 @@ define amdgpu_kernel void @flat_cluster_acquire_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1725,11 +2179,15 @@ define amdgpu_kernel void @flat_cluster_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1744,11 +2202,15 @@ define amdgpu_kernel void @flat_cluster_acquire_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1759,11 +2221,15 @@ define amdgpu_kernel void @flat_cluster_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_cluster_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1773,10 +2239,14 @@ define amdgpu_kernel void @flat_cluster_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1787,10 +2257,14 @@ define amdgpu_kernel void @flat_cluster_acquire_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -1799,10 +2273,14 @@ define amdgpu_kernel void @flat_cluster_acquire_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1811,10 +2289,14 @@ define amdgpu_kernel void @flat_cluster_acquire_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -1823,11 +2305,15 @@ define amdgpu_kernel void @flat_cluster_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: flat_cluster_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1838,11 +2324,15 @@ define amdgpu_kernel void @flat_cluster_acquire_atomicrmw( ; ; GFX11-CU-LABEL: flat_cluster_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1853,11 +2343,15 @@ define amdgpu_kernel void @flat_cluster_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: flat_cluster_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 @@ -1866,11 +2360,15 @@ define amdgpu_kernel void @flat_cluster_acquire_atomicrmw( ; ; GFX12-CU-LABEL: flat_cluster_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 @@ -1880,7 +2378,11 @@ define amdgpu_kernel void @flat_cluster_acquire_atomicrmw( ; GFX1250-LABEL: flat_cluster_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1902,11 +2404,15 @@ define amdgpu_kernel void @flat_cluster_release_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -1918,11 +2424,15 @@ define amdgpu_kernel void @flat_cluster_release_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1935,11 +2445,15 @@ define amdgpu_kernel void @flat_cluster_release_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1948,11 +2462,15 @@ define amdgpu_kernel void @flat_cluster_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_cluster_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 @@ -1962,10 +2480,14 @@ define amdgpu_kernel void @flat_cluster_release_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -1975,10 +2497,14 @@ define amdgpu_kernel void @flat_cluster_release_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -1986,10 +2512,14 @@ define amdgpu_kernel void @flat_cluster_release_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1998,10 +2528,14 @@ define amdgpu_kernel void @flat_cluster_release_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_cluster_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2010,11 +2544,15 @@ define amdgpu_kernel void @flat_cluster_release_atomicrmw( ; ; GFX11-WGP-LABEL: flat_cluster_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2023,11 +2561,15 @@ define amdgpu_kernel void @flat_cluster_release_atomicrmw( ; ; GFX11-CU-LABEL: flat_cluster_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2036,11 +2578,15 @@ define amdgpu_kernel void @flat_cluster_release_atomicrmw( ; ; GFX12-WGP-LABEL: flat_cluster_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -2051,11 +2597,15 @@ define amdgpu_kernel void @flat_cluster_release_atomicrmw( ; ; GFX12-CU-LABEL: flat_cluster_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -2067,7 +2617,11 @@ define amdgpu_kernel void @flat_cluster_release_atomicrmw( ; GFX1250-LABEL: flat_cluster_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2089,11 +2643,15 @@ define amdgpu_kernel void @flat_cluster_acq_rel_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -2107,11 +2665,15 @@ define amdgpu_kernel void @flat_cluster_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2128,11 +2690,15 @@ define amdgpu_kernel void @flat_cluster_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2145,11 +2711,15 @@ define amdgpu_kernel void @flat_cluster_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_cluster_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 @@ -2160,10 +2730,14 @@ define amdgpu_kernel void @flat_cluster_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -2175,10 +2749,14 @@ define amdgpu_kernel void @flat_cluster_acq_rel_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -2188,10 +2766,14 @@ define amdgpu_kernel void @flat_cluster_acq_rel_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2202,10 +2784,14 @@ define amdgpu_kernel void @flat_cluster_acq_rel_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_cluster_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2216,11 +2802,15 @@ define amdgpu_kernel void @flat_cluster_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: flat_cluster_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2233,11 +2823,15 @@ define amdgpu_kernel void @flat_cluster_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: flat_cluster_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2250,11 +2844,15 @@ define amdgpu_kernel void @flat_cluster_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: flat_cluster_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -2267,11 +2865,15 @@ define amdgpu_kernel void @flat_cluster_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: flat_cluster_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -2285,7 +2887,11 @@ define amdgpu_kernel void @flat_cluster_acq_rel_atomicrmw( ; GFX1250-LABEL: flat_cluster_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2309,11 +2915,15 @@ define amdgpu_kernel void @flat_cluster_seq_cst_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -2327,11 +2937,15 @@ define amdgpu_kernel void @flat_cluster_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2348,11 +2962,15 @@ define amdgpu_kernel void @flat_cluster_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2365,11 +2983,15 @@ define amdgpu_kernel void @flat_cluster_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 @@ -2380,10 +3002,14 @@ define amdgpu_kernel void @flat_cluster_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -2395,10 +3021,14 @@ define amdgpu_kernel void @flat_cluster_seq_cst_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -2408,10 +3038,14 @@ define amdgpu_kernel void @flat_cluster_seq_cst_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2422,10 +3056,14 @@ define amdgpu_kernel void @flat_cluster_seq_cst_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2436,11 +3074,15 @@ define amdgpu_kernel void @flat_cluster_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: flat_cluster_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2453,11 +3095,15 @@ define amdgpu_kernel void @flat_cluster_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: flat_cluster_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2470,11 +3116,15 @@ define amdgpu_kernel void @flat_cluster_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: flat_cluster_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -2487,11 +3137,15 @@ define amdgpu_kernel void @flat_cluster_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: flat_cluster_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -2505,7 +3159,11 @@ define amdgpu_kernel void @flat_cluster_seq_cst_atomicrmw( ; GFX1250-LABEL: flat_cluster_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2530,6 +3188,10 @@ define amdgpu_kernel void @flat_cluster_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2550,6 +3212,10 @@ define amdgpu_kernel void @flat_cluster_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2571,6 +3237,10 @@ define amdgpu_kernel void @flat_cluster_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2588,6 +3258,10 @@ define amdgpu_kernel void @flat_cluster_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_cluster_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2605,6 +3279,10 @@ define amdgpu_kernel void @flat_cluster_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2621,6 +3299,10 @@ define amdgpu_kernel void @flat_cluster_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2635,6 +3317,10 @@ define amdgpu_kernel void @flat_cluster_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2649,6 +3335,10 @@ define amdgpu_kernel void @flat_cluster_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2663,6 +3353,10 @@ define amdgpu_kernel void @flat_cluster_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_cluster_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2680,6 +3374,10 @@ define amdgpu_kernel void @flat_cluster_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: flat_cluster_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2697,6 +3395,10 @@ define amdgpu_kernel void @flat_cluster_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_cluster_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2713,6 +3415,10 @@ define amdgpu_kernel void @flat_cluster_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: flat_cluster_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2729,7 +3435,11 @@ define amdgpu_kernel void @flat_cluster_acquire_ret_atomicrmw( ; GFX1250-LABEL: flat_cluster_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2754,6 +3464,10 @@ define amdgpu_kernel void @flat_cluster_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2775,6 +3489,10 @@ define amdgpu_kernel void @flat_cluster_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2798,6 +3516,10 @@ define amdgpu_kernel void @flat_cluster_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2817,6 +3539,10 @@ define amdgpu_kernel void @flat_cluster_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_cluster_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2835,6 +3561,10 @@ define amdgpu_kernel void @flat_cluster_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2852,6 +3582,10 @@ define amdgpu_kernel void @flat_cluster_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2867,6 +3601,10 @@ define amdgpu_kernel void @flat_cluster_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2883,6 +3621,10 @@ define amdgpu_kernel void @flat_cluster_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_cluster_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2899,6 +3641,10 @@ define amdgpu_kernel void @flat_cluster_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_cluster_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2918,6 +3664,10 @@ define amdgpu_kernel void @flat_cluster_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: flat_cluster_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2937,6 +3687,10 @@ define amdgpu_kernel void @flat_cluster_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_cluster_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2957,6 +3711,10 @@ define amdgpu_kernel void @flat_cluster_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: flat_cluster_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2977,7 +3735,11 @@ define amdgpu_kernel void @flat_cluster_acq_rel_ret_atomicrmw( ; GFX1250-LABEL: flat_cluster_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -3004,6 +3766,10 @@ define amdgpu_kernel void @flat_cluster_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -3025,6 +3791,10 @@ define amdgpu_kernel void @flat_cluster_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -3048,6 +3818,10 @@ define amdgpu_kernel void @flat_cluster_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -3067,6 +3841,10 @@ define amdgpu_kernel void @flat_cluster_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -3085,6 +3863,10 @@ define amdgpu_kernel void @flat_cluster_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -3102,6 +3884,10 @@ define amdgpu_kernel void @flat_cluster_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -3117,6 +3903,10 @@ define amdgpu_kernel void @flat_cluster_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -3133,6 +3923,10 @@ define amdgpu_kernel void @flat_cluster_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -3149,6 +3943,10 @@ define amdgpu_kernel void @flat_cluster_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_cluster_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3168,6 +3966,10 @@ define amdgpu_kernel void @flat_cluster_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: flat_cluster_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3187,6 +3989,10 @@ define amdgpu_kernel void @flat_cluster_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_cluster_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3207,6 +4013,10 @@ define amdgpu_kernel void @flat_cluster_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: flat_cluster_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3227,7 +4037,11 @@ define amdgpu_kernel void @flat_cluster_seq_cst_ret_atomicrmw( ; GFX1250-LABEL: flat_cluster_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -3254,11 +4068,16 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3268,6 +4087,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3283,11 +4103,16 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -3297,6 +4122,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -3312,11 +4138,16 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -3326,6 +4157,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -3337,11 +4169,16 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -3351,6 +4188,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -3364,6 +4202,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3380,6 +4224,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3394,6 +4244,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3408,6 +4264,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3422,6 +4284,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3437,6 +4305,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3452,6 +4326,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3467,6 +4347,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3482,7 +4368,13 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: flat_cluster_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3508,11 +4400,16 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3522,6 +4419,7 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3539,11 +4437,16 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -3553,6 +4456,7 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -3572,11 +4476,16 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -3586,6 +4495,7 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -3601,11 +4511,16 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -3615,6 +4530,7 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -3629,6 +4545,12 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3647,6 +4569,12 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3663,6 +4591,12 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3679,6 +4613,12 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3695,6 +4635,12 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3714,6 +4660,12 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3733,6 +4685,12 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3750,6 +4708,12 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3767,7 +4731,13 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: flat_cluster_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3795,11 +4765,16 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3809,6 +4784,7 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3825,11 +4801,16 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -3839,6 +4820,7 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -3856,11 +4838,16 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -3870,6 +4857,7 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -3883,11 +4871,16 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -3897,6 +4890,7 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -3911,6 +4905,12 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3928,6 +4928,12 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3943,6 +4949,12 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3959,6 +4971,12 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3975,6 +4993,12 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3992,6 +5016,12 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4009,6 +5039,12 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4028,6 +5064,12 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4047,7 +5089,13 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_cmpxchg( ; GFX1250-LABEL: flat_cluster_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4075,11 +5123,16 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4089,6 +5142,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4107,11 +5161,16 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -4121,6 +5180,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -4142,11 +5202,16 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -4156,6 +5221,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -4173,11 +5239,16 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -4187,6 +5258,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -4202,6 +5274,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4221,6 +5299,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4238,6 +5322,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4256,6 +5346,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4274,6 +5370,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4295,6 +5397,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4316,6 +5424,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4337,6 +5451,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4358,7 +5478,13 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4388,11 +5514,16 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4402,6 +5533,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4420,11 +5552,16 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -4434,6 +5571,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -4455,11 +5593,16 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -4469,6 +5612,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -4486,11 +5630,16 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -4500,6 +5649,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -4515,6 +5665,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4534,6 +5690,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4551,6 +5713,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4569,6 +5737,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4587,6 +5761,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4608,6 +5788,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4629,6 +5815,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4650,6 +5842,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4671,7 +5869,13 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4701,11 +5905,16 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4715,6 +5924,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4732,11 +5942,16 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -4746,6 +5961,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -4765,11 +5981,16 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -4779,6 +6000,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -4794,11 +6016,16 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -4808,6 +6035,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -4822,6 +6050,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4840,6 +6074,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4856,6 +6096,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4872,6 +6118,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4888,6 +6140,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4907,6 +6165,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4926,6 +6190,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4943,6 +6213,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4960,7 +6236,13 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: flat_cluster_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4988,11 +6270,16 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5002,6 +6289,7 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5019,11 +6307,16 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -5033,6 +6326,7 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -5052,11 +6346,16 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -5066,6 +6365,7 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -5081,11 +6381,16 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -5095,6 +6400,7 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -5109,6 +6415,12 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5127,6 +6439,12 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5143,6 +6461,12 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5159,6 +6483,12 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5175,6 +6505,12 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5194,6 +6530,12 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5213,6 +6555,12 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5230,6 +6578,12 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5247,7 +6601,13 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_cmpxchg( ; GFX1250-LABEL: flat_cluster_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5275,11 +6635,16 @@ define amdgpu_kernel void @flat_cluster_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5289,6 +6654,7 @@ define amdgpu_kernel void @flat_cluster_release_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5307,11 +6673,16 @@ define amdgpu_kernel void @flat_cluster_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -5321,6 +6692,7 @@ define amdgpu_kernel void @flat_cluster_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -5342,11 +6714,16 @@ define amdgpu_kernel void @flat_cluster_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -5356,6 +6733,7 @@ define amdgpu_kernel void @flat_cluster_release_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -5373,11 +6751,16 @@ define amdgpu_kernel void @flat_cluster_release_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -5387,6 +6770,7 @@ define amdgpu_kernel void @flat_cluster_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -5402,6 +6786,12 @@ define amdgpu_kernel void @flat_cluster_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5421,6 +6811,12 @@ define amdgpu_kernel void @flat_cluster_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5438,6 +6834,12 @@ define amdgpu_kernel void @flat_cluster_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5456,6 +6858,12 @@ define amdgpu_kernel void @flat_cluster_release_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5474,6 +6882,12 @@ define amdgpu_kernel void @flat_cluster_release_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5495,6 +6909,12 @@ define amdgpu_kernel void @flat_cluster_release_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5516,6 +6936,12 @@ define amdgpu_kernel void @flat_cluster_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5537,6 +6963,12 @@ define amdgpu_kernel void @flat_cluster_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5558,7 +6990,13 @@ define amdgpu_kernel void @flat_cluster_release_acquire_cmpxchg( ; GFX1250-LABEL: flat_cluster_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5588,11 +7026,16 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5602,6 +7045,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5620,11 +7064,16 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -5634,6 +7083,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -5655,11 +7105,16 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -5669,6 +7124,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -5686,11 +7142,16 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -5700,6 +7161,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -5715,6 +7177,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5734,6 +7202,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5751,6 +7225,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5769,6 +7249,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5787,6 +7273,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5808,6 +7300,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5829,6 +7327,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5850,6 +7354,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5871,7 +7381,13 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: flat_cluster_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5901,11 +7417,16 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5915,6 +7436,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5933,11 +7455,16 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -5947,6 +7474,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -5968,11 +7496,16 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -5982,6 +7515,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -5999,11 +7533,16 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -6013,6 +7552,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -6028,6 +7568,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6047,6 +7593,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6064,6 +7616,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6082,6 +7640,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6100,6 +7664,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6121,6 +7691,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6142,6 +7718,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6163,6 +7745,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6184,7 +7772,13 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: flat_cluster_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6214,11 +7808,16 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6228,6 +7827,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6246,11 +7846,16 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -6260,6 +7865,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -6281,11 +7887,16 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -6295,6 +7906,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -6312,11 +7924,16 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -6326,6 +7943,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -6341,6 +7959,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6360,6 +7984,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6377,6 +8007,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6395,6 +8031,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6413,6 +8055,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6434,6 +8082,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6455,6 +8109,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6476,6 +8136,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6497,7 +8163,13 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6527,11 +8199,16 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6541,6 +8218,7 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6559,11 +8237,16 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -6573,6 +8256,7 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -6594,11 +8278,16 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -6608,6 +8297,7 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -6625,11 +8315,16 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -6639,6 +8334,7 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -6654,6 +8350,12 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6673,6 +8375,12 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6690,6 +8398,12 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6708,6 +8422,12 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6726,6 +8446,12 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6747,6 +8473,12 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6768,6 +8500,12 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6789,6 +8527,12 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6810,7 +8554,13 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_cluster_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6840,11 +8590,16 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6854,6 +8609,7 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6872,11 +8628,16 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -6886,6 +8647,7 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -6907,11 +8669,16 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -6921,6 +8688,7 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -6938,11 +8706,16 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -6952,6 +8725,7 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -6967,6 +8741,12 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6986,6 +8766,12 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7003,6 +8789,12 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7021,6 +8813,12 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7039,6 +8837,12 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7060,6 +8864,12 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7081,6 +8891,12 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7102,6 +8918,12 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7123,7 +8945,13 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_cluster_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7153,11 +8981,16 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -7167,6 +9000,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -7185,11 +9019,16 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -7199,6 +9038,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -7220,11 +9060,16 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -7234,6 +9079,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -7251,11 +9097,16 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -7265,6 +9116,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -7280,6 +9132,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7299,6 +9157,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7316,6 +9180,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7334,6 +9204,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7352,6 +9228,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7373,6 +9255,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7394,6 +9282,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7415,6 +9309,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7436,7 +9336,13 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7466,11 +9372,16 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -7480,6 +9391,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -7498,11 +9410,16 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -7512,6 +9429,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -7533,11 +9451,16 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -7547,6 +9470,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -7564,11 +9488,16 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -7578,6 +9507,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -7593,6 +9523,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7612,6 +9548,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7629,6 +9571,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7647,6 +9595,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7665,6 +9619,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7686,6 +9646,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7707,6 +9673,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7728,6 +9700,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7749,7 +9727,13 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7780,6 +9764,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7813,6 +9803,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -7846,6 +9842,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -7875,6 +9877,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -7905,6 +9913,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7924,6 +9938,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7941,6 +9961,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7958,6 +9984,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7975,6 +10007,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7994,6 +10032,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8013,6 +10057,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8032,6 +10082,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8051,7 +10107,13 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8082,6 +10144,12 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8116,6 +10184,12 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -8151,6 +10225,12 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -8182,6 +10262,12 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -8212,6 +10298,12 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8232,6 +10324,12 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8250,6 +10348,12 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8268,6 +10372,12 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8286,6 +10396,12 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8307,6 +10423,12 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8328,6 +10450,12 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8348,6 +10476,12 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8368,7 +10502,13 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8400,6 +10540,12 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8434,6 +10580,12 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -8469,6 +10621,12 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -8500,6 +10658,12 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -8531,6 +10695,12 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8551,6 +10721,12 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8569,6 +10745,12 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8588,6 +10770,12 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8607,6 +10795,12 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8628,6 +10822,12 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8649,6 +10849,12 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8672,6 +10878,12 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8695,7 +10907,13 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8728,6 +10946,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8763,6 +10987,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -8800,6 +11030,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -8833,6 +11069,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -8864,6 +11106,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8885,6 +11133,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8904,6 +11158,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8924,6 +11184,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8944,6 +11210,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8967,6 +11239,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8990,6 +11268,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9014,6 +11298,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9038,7 +11328,13 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9072,6 +11368,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9107,6 +11409,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -9144,6 +11452,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -9177,6 +11491,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -9208,6 +11528,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9229,6 +11555,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9248,6 +11580,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9268,6 +11606,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9288,6 +11632,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9311,6 +11661,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9334,6 +11690,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9358,6 +11720,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9382,7 +11750,13 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9416,6 +11790,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9450,6 +11830,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -9485,6 +11871,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -9516,6 +11908,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -9546,6 +11944,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9566,6 +11970,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9584,6 +11994,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9602,6 +12018,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9620,6 +12042,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9641,6 +12069,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9662,6 +12096,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9682,6 +12122,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9702,7 +12148,13 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9734,6 +12186,12 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9768,6 +12226,12 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -9803,6 +12267,12 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -9834,6 +12304,12 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -9864,6 +12340,12 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9884,6 +12366,12 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9902,6 +12390,12 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9920,6 +12414,12 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9938,6 +12438,12 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9959,6 +12465,12 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9980,6 +12492,12 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10000,6 +12518,12 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10020,7 +12544,13 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10052,6 +12582,12 @@ define amdgpu_kernel void @flat_cluster_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10087,6 +12623,12 @@ define amdgpu_kernel void @flat_cluster_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -10124,6 +12666,12 @@ define amdgpu_kernel void @flat_cluster_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -10157,6 +12705,12 @@ define amdgpu_kernel void @flat_cluster_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -10188,6 +12742,12 @@ define amdgpu_kernel void @flat_cluster_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10209,6 +12769,12 @@ define amdgpu_kernel void @flat_cluster_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10228,6 +12794,12 @@ define amdgpu_kernel void @flat_cluster_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10248,6 +12820,12 @@ define amdgpu_kernel void @flat_cluster_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10268,6 +12846,12 @@ define amdgpu_kernel void @flat_cluster_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10291,6 +12875,12 @@ define amdgpu_kernel void @flat_cluster_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10314,6 +12904,12 @@ define amdgpu_kernel void @flat_cluster_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10338,6 +12934,12 @@ define amdgpu_kernel void @flat_cluster_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10362,7 +12964,13 @@ define amdgpu_kernel void @flat_cluster_release_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10396,6 +13004,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10431,6 +13045,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -10468,6 +13088,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -10501,6 +13127,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -10532,6 +13164,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10553,6 +13191,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10572,6 +13216,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10592,6 +13242,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10612,6 +13268,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10635,6 +13297,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10658,6 +13326,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10682,6 +13356,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10706,7 +13386,13 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10740,6 +13426,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10775,6 +13467,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -10812,6 +13510,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -10845,6 +13549,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -10876,6 +13586,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10897,6 +13613,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10916,6 +13638,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10936,6 +13664,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10956,6 +13690,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10979,6 +13719,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11002,6 +13748,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11026,6 +13778,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11050,7 +13808,13 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -11084,6 +13848,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -11119,6 +13889,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -11156,6 +13932,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -11189,6 +13971,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -11220,6 +14008,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11241,6 +14035,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11260,6 +14060,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11280,6 +14086,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11300,6 +14112,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11323,6 +14141,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11346,6 +14170,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11370,6 +14200,12 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11394,7 +14230,13 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -11428,6 +14270,12 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -11463,6 +14311,12 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -11500,6 +14354,12 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -11533,6 +14393,12 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -11564,6 +14430,12 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11585,6 +14457,12 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11604,6 +14482,12 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11624,6 +14508,12 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11644,6 +14534,12 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11667,6 +14563,12 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11690,6 +14592,12 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11714,6 +14622,12 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11738,7 +14652,13 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -11772,6 +14692,12 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -11807,6 +14733,12 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -11844,6 +14776,12 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -11877,6 +14815,12 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -11908,6 +14852,12 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11929,6 +14879,12 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11948,6 +14904,12 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11968,6 +14930,12 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11988,6 +14956,12 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12011,6 +14985,12 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12034,6 +15014,12 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12058,6 +15044,12 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12082,7 +15074,13 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -12116,6 +15114,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -12151,6 +15155,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -12188,6 +15198,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -12221,6 +15237,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -12252,6 +15274,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12273,6 +15301,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12292,6 +15326,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12312,6 +15352,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12332,6 +15378,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12355,6 +15407,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12378,6 +15436,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12402,6 +15466,12 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12426,7 +15496,13 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -12460,6 +15536,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -12495,6 +15577,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -12532,6 +15620,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -12565,6 +15659,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -12596,6 +15696,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12617,6 +15723,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12636,6 +15748,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12656,6 +15774,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12676,6 +15800,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12699,6 +15829,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12722,6 +15858,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12746,6 +15888,12 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12770,7 +15918,13 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -12802,15 +15956,19 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12820,15 +15978,19 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -12838,29 +16000,37 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12868,13 +16038,17 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12882,11 +16056,15 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12894,23 +16072,31 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12918,40 +16104,52 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_load( ; ; GFX11-WGP-LABEL: flat_cluster_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_cluster_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_cluster_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12960,12 +16158,16 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_load( ; ; GFX12-CU-LABEL: flat_cluster_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12975,12 +16177,16 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_load( ; GFX1250-LABEL: flat_cluster_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -12996,15 +16202,19 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -13014,15 +16224,19 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -13032,29 +16246,37 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13062,13 +16284,17 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13076,11 +16302,15 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -13088,23 +16318,31 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -13112,40 +16350,52 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_load( ; ; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -13154,12 +16404,16 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_load( ; ; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -13169,12 +16423,16 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_load( ; GFX1250-LABEL: flat_cluster_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -13190,17 +16448,20 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -13210,18 +16471,21 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -13231,33 +16495,38 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13265,15 +16534,18 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13281,88 +16553,109 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_cluster_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -13371,14 +16664,18 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_load( ; ; GFX12-CU-LABEL: flat_cluster_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -13388,14 +16685,18 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_load( ; GFX1250-LABEL: flat_cluster_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SE ; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -13411,18 +16712,21 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -13432,9 +16736,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -13443,9 +16750,9 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_load( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -13455,9 +16762,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -13466,25 +16776,27 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_load( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13492,16 +16804,19 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13509,52 +16824,66 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -13563,17 +16892,20 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_load( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -13582,17 +16914,20 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_load( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -13602,6 +16937,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_load( ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -13610,9 +16946,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -13622,6 +16961,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_load( ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -13631,16 +16971,20 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_load( ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SE ; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -13657,6 +17001,10 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -13672,6 +17020,10 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -13687,6 +17039,10 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -13698,6 +17054,10 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_store( ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -13711,6 +17071,10 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -13723,6 +17087,10 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -13733,6 +17101,10 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_store( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -13743,6 +17115,10 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_store( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -13753,6 +17129,10 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_store( ; GFX11-WGP-LABEL: flat_cluster_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -13764,6 +17144,10 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_store( ; GFX11-CU-LABEL: flat_cluster_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -13775,6 +17159,10 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_store( ; GFX12-WGP-LABEL: flat_cluster_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -13786,6 +17174,10 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_store( ; GFX12-CU-LABEL: flat_cluster_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -13797,12 +17189,16 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_store( ; GFX1250-LABEL: flat_cluster_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -13818,6 +17214,10 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -13833,6 +17233,10 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -13848,6 +17252,10 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -13859,6 +17267,10 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_store( ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -13872,6 +17284,10 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -13884,6 +17300,10 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -13894,6 +17314,10 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_store( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -13904,6 +17328,10 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_store( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -13914,6 +17342,10 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_store( ; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -13925,6 +17357,10 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_store( ; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -13936,6 +17372,10 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_store( ; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -13947,6 +17387,10 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_store( ; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -13958,12 +17402,16 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_store( ; GFX1250-LABEL: flat_cluster_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -13979,6 +17427,10 @@ define amdgpu_kernel void @flat_cluster_one_as_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -13995,6 +17447,10 @@ define amdgpu_kernel void @flat_cluster_one_as_release_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -14012,6 +17468,10 @@ define amdgpu_kernel void @flat_cluster_one_as_release_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -14025,6 +17485,10 @@ define amdgpu_kernel void @flat_cluster_one_as_release_store( ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -14039,6 +17503,10 @@ define amdgpu_kernel void @flat_cluster_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -14052,6 +17520,10 @@ define amdgpu_kernel void @flat_cluster_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -14063,6 +17535,10 @@ define amdgpu_kernel void @flat_cluster_one_as_release_store( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -14075,6 +17551,10 @@ define amdgpu_kernel void @flat_cluster_one_as_release_store( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -14087,6 +17567,10 @@ define amdgpu_kernel void @flat_cluster_one_as_release_store( ; GFX11-WGP-LABEL: flat_cluster_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -14100,6 +17584,10 @@ define amdgpu_kernel void @flat_cluster_one_as_release_store( ; GFX11-CU-LABEL: flat_cluster_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -14113,6 +17601,10 @@ define amdgpu_kernel void @flat_cluster_one_as_release_store( ; GFX12-WGP-LABEL: flat_cluster_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -14128,6 +17620,10 @@ define amdgpu_kernel void @flat_cluster_one_as_release_store( ; GFX12-CU-LABEL: flat_cluster_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -14143,14 +17639,18 @@ define amdgpu_kernel void @flat_cluster_one_as_release_store( ; GFX1250-LABEL: flat_cluster_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -14166,6 +17666,10 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -14182,6 +17686,10 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -14199,6 +17707,10 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -14212,6 +17724,10 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_store( ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -14226,6 +17742,10 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -14239,6 +17759,10 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -14250,6 +17774,10 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -14262,6 +17790,10 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_store( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -14274,6 +17806,10 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_store( ; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -14287,6 +17823,10 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_store( ; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -14300,6 +17840,10 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_store( ; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -14315,6 +17859,10 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_store( ; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -14330,14 +17878,18 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_store( ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -14352,11 +17904,15 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -14367,11 +17923,15 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -14382,22 +17942,30 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -14406,10 +17974,14 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -14418,74 +17990,102 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -14493,7 +18093,11 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_atomicrmw( ; GFX1250-LABEL: flat_cluster_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14513,11 +18117,15 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -14530,11 +18138,15 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -14548,11 +18160,15 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -14562,11 +18178,15 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -14576,10 +18196,14 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -14590,10 +18214,14 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -14602,10 +18230,14 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -14614,10 +18246,14 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -14626,11 +18262,15 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -14640,11 +18280,15 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_atomicrmw( ; ; GFX11-CU-LABEL: flat_cluster_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -14654,11 +18298,15 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -14667,11 +18315,15 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_atomicrmw( ; ; GFX12-CU-LABEL: flat_cluster_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 @@ -14681,7 +18333,11 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_atomicrmw( ; GFX1250-LABEL: flat_cluster_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14703,11 +18359,15 @@ define amdgpu_kernel void @flat_cluster_one_as_release_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -14719,11 +18379,15 @@ define amdgpu_kernel void @flat_cluster_one_as_release_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -14736,11 +18400,15 @@ define amdgpu_kernel void @flat_cluster_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -14749,11 +18417,15 @@ define amdgpu_kernel void @flat_cluster_one_as_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 @@ -14763,10 +18435,14 @@ define amdgpu_kernel void @flat_cluster_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -14776,10 +18452,14 @@ define amdgpu_kernel void @flat_cluster_one_as_release_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -14787,10 +18467,14 @@ define amdgpu_kernel void @flat_cluster_one_as_release_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -14799,10 +18483,14 @@ define amdgpu_kernel void @flat_cluster_one_as_release_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -14811,11 +18499,15 @@ define amdgpu_kernel void @flat_cluster_one_as_release_atomicrmw( ; ; GFX11-WGP-LABEL: flat_cluster_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -14824,11 +18516,15 @@ define amdgpu_kernel void @flat_cluster_one_as_release_atomicrmw( ; ; GFX11-CU-LABEL: flat_cluster_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -14837,11 +18533,15 @@ define amdgpu_kernel void @flat_cluster_one_as_release_atomicrmw( ; ; GFX12-WGP-LABEL: flat_cluster_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -14852,11 +18552,15 @@ define amdgpu_kernel void @flat_cluster_one_as_release_atomicrmw( ; ; GFX12-CU-LABEL: flat_cluster_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -14868,7 +18572,11 @@ define amdgpu_kernel void @flat_cluster_one_as_release_atomicrmw( ; GFX1250-LABEL: flat_cluster_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14890,11 +18598,15 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -14908,11 +18620,15 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -14928,11 +18644,15 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -14944,11 +18664,15 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 @@ -14959,10 +18683,14 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -14974,10 +18702,14 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -14987,10 +18719,14 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -15001,10 +18737,14 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -15015,11 +18755,15 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: flat_cluster_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15031,11 +18775,15 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: flat_cluster_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15047,11 +18795,15 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: flat_cluster_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -15064,11 +18816,15 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: flat_cluster_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -15082,7 +18838,11 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_atomicrmw( ; GFX1250-LABEL: flat_cluster_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15106,11 +18866,15 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -15124,11 +18888,15 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15144,11 +18912,15 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15160,11 +18932,15 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 @@ -15175,10 +18951,14 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -15190,10 +18970,14 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -15203,10 +18987,14 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -15217,10 +19005,14 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -15231,11 +19023,15 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15247,11 +19043,15 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15263,11 +19063,15 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -15280,11 +19084,15 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -15298,7 +19106,11 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_atomicrmw( ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15323,6 +19135,10 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15344,6 +19160,10 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -15366,6 +19186,10 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -15384,6 +19208,10 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15402,6 +19230,10 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -15419,6 +19251,10 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -15433,6 +19269,10 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -15448,6 +19288,10 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -15462,6 +19306,10 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -15480,6 +19328,10 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -15498,6 +19350,10 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -15515,6 +19371,10 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -15532,7 +19392,11 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_ret_atomicrmw( ; GFX1250-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15558,6 +19422,10 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15580,6 +19448,10 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -15604,6 +19476,10 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -15624,6 +19500,10 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15643,6 +19523,10 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -15661,6 +19545,10 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -15676,6 +19564,10 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -15693,6 +19585,10 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -15709,6 +19605,10 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -15729,6 +19629,10 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -15749,6 +19653,10 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -15770,6 +19678,10 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -15791,7 +19703,11 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX1250-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15819,6 +19735,10 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15841,6 +19761,10 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -15865,6 +19789,10 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -15885,6 +19813,10 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15904,6 +19836,10 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -15922,6 +19858,10 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -15937,6 +19877,10 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -15954,6 +19898,10 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -15970,6 +19918,10 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -15990,6 +19942,10 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -16010,6 +19966,10 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -16031,6 +19991,10 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -16052,7 +20016,11 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16080,11 +20048,16 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16094,6 +20067,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16109,11 +20083,16 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -16123,6 +20102,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -16138,11 +20118,16 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -16152,6 +20137,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -16163,11 +20149,16 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -16177,6 +20168,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -16190,6 +20182,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16206,6 +20204,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16220,6 +20224,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16234,6 +20244,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16248,6 +20264,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16263,6 +20285,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16278,6 +20306,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16293,6 +20327,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16308,7 +20348,13 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16334,11 +20380,16 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16348,6 +20399,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16365,11 +20417,16 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -16379,6 +20436,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -16397,11 +20455,16 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -16411,6 +20474,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -16425,11 +20489,16 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -16439,6 +20508,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -16453,6 +20523,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16471,6 +20547,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16487,6 +20569,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16503,6 +20591,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16519,6 +20613,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16537,6 +20637,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16555,6 +20661,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16572,6 +20684,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16589,7 +20707,13 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16617,11 +20741,16 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16631,6 +20760,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16647,11 +20777,16 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -16661,6 +20796,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -16678,11 +20814,16 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -16692,6 +20833,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -16705,11 +20847,16 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -16719,6 +20866,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -16733,6 +20881,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16750,6 +20904,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16765,6 +20925,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16781,6 +20947,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16797,6 +20969,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16814,6 +20992,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16831,6 +21015,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16850,6 +21040,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16869,7 +21065,13 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16897,11 +21099,16 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16911,6 +21118,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16929,11 +21137,16 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -16943,6 +21156,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -16963,11 +21177,16 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -16977,6 +21196,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -16993,11 +21213,16 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -17007,6 +21232,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -17022,6 +21248,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17041,6 +21273,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17058,6 +21296,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17076,6 +21320,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17094,6 +21344,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17114,6 +21370,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17134,6 +21396,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17155,6 +21423,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17176,7 +21450,13 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17206,11 +21486,16 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17220,6 +21505,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17238,11 +21524,16 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -17252,6 +21543,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -17272,11 +21564,16 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -17286,6 +21583,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -17302,11 +21600,16 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -17316,6 +21619,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -17331,6 +21635,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17350,6 +21660,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17367,6 +21683,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17385,6 +21707,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17403,6 +21731,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17423,6 +21757,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17443,6 +21783,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17464,6 +21810,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17485,7 +21837,13 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17515,11 +21873,16 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17529,6 +21892,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17546,11 +21910,16 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -17560,6 +21929,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -17578,11 +21948,16 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -17592,6 +21967,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -17606,11 +21982,16 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -17620,6 +22001,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -17634,6 +22016,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17652,6 +22040,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17668,6 +22062,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17684,6 +22084,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17700,6 +22106,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17718,6 +22130,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17736,6 +22154,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17753,6 +22177,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17770,7 +22200,13 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17798,11 +22234,16 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17812,6 +22253,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17829,11 +22271,16 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -17843,6 +22290,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -17861,11 +22309,16 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -17875,6 +22328,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -17889,11 +22343,16 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -17903,6 +22362,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -17917,6 +22377,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17935,6 +22401,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17951,6 +22423,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17967,6 +22445,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17983,6 +22467,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18001,6 +22491,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18019,6 +22515,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18036,6 +22538,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18053,7 +22561,13 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18081,11 +22595,16 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -18095,6 +22614,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -18113,11 +22633,16 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -18127,6 +22652,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -18147,11 +22673,16 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -18161,6 +22692,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -18177,11 +22709,16 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -18191,6 +22728,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -18206,6 +22744,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18225,6 +22769,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18242,6 +22792,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18260,6 +22816,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18278,6 +22840,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18298,6 +22866,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18318,6 +22892,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18339,6 +22919,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18360,7 +22946,13 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18390,11 +22982,16 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -18404,6 +23001,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -18422,11 +23020,16 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -18436,6 +23039,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -18456,11 +23060,16 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -18470,6 +23079,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -18486,11 +23096,16 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -18500,6 +23115,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -18515,6 +23131,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18534,6 +23156,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18551,6 +23179,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18569,6 +23203,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18587,6 +23227,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18607,6 +23253,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18627,6 +23279,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18648,6 +23306,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18669,7 +23333,13 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18699,11 +23369,16 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -18713,6 +23388,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -18731,11 +23407,16 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -18745,6 +23426,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -18765,11 +23447,16 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -18779,6 +23466,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -18795,11 +23483,16 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -18809,6 +23502,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -18824,6 +23518,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18843,6 +23543,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18860,6 +23566,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18878,6 +23590,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18896,6 +23614,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18916,6 +23640,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18936,6 +23666,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18957,6 +23693,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18978,7 +23720,13 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19008,11 +23756,16 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -19022,6 +23775,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -19040,11 +23794,16 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -19054,6 +23813,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -19074,11 +23834,16 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -19088,6 +23853,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -19104,11 +23870,16 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -19118,6 +23889,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -19133,6 +23905,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19152,6 +23930,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19169,6 +23953,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19187,6 +23977,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19205,6 +24001,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19225,6 +24027,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19245,6 +24053,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19266,6 +24080,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19287,7 +24107,13 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19317,11 +24143,16 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -19331,6 +24162,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -19349,11 +24181,16 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -19363,6 +24200,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -19383,11 +24221,16 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -19397,6 +24240,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -19413,11 +24257,16 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -19427,6 +24276,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -19442,6 +24292,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19461,6 +24317,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19478,6 +24340,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19496,6 +24364,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19514,6 +24388,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19534,6 +24414,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19554,6 +24440,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19575,6 +24467,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19596,7 +24494,13 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19626,11 +24530,16 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -19640,6 +24549,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -19658,11 +24568,16 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -19672,6 +24587,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -19692,11 +24608,16 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -19706,6 +24627,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -19722,11 +24644,16 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -19736,6 +24663,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -19751,6 +24679,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19770,6 +24704,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19787,6 +24727,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19805,6 +24751,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19823,6 +24775,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19843,6 +24801,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19863,6 +24827,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19884,6 +24854,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19905,7 +24881,13 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19935,11 +24917,16 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -19949,6 +24936,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -19967,11 +24955,16 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -19981,6 +24974,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -20001,11 +24995,16 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -20015,6 +25014,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -20031,11 +25031,16 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -20045,6 +25050,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -20060,6 +25066,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20079,6 +25091,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20096,6 +25114,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20114,6 +25138,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20132,6 +25162,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20152,6 +25188,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20172,6 +25214,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20193,6 +25241,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20214,7 +25268,13 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20244,11 +25304,16 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -20258,6 +25323,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -20276,11 +25342,16 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -20290,6 +25361,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -20310,11 +25382,16 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -20324,6 +25401,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -20340,11 +25418,16 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -20354,6 +25437,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -20369,6 +25453,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20388,6 +25478,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20405,6 +25501,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20423,6 +25525,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20441,6 +25549,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20461,6 +25575,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20481,6 +25601,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20502,6 +25628,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20523,7 +25655,13 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20554,6 +25692,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20587,6 +25731,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -20620,6 +25770,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -20649,6 +25805,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -20679,6 +25841,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20698,6 +25866,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20715,6 +25889,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20732,6 +25912,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20749,6 +25935,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20768,6 +25960,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20787,6 +25985,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20806,6 +26010,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20825,7 +26035,13 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20856,6 +26072,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20891,6 +26113,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -20927,6 +26155,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -20959,6 +26193,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -20990,6 +26230,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21011,6 +26257,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21029,6 +26281,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21048,6 +26306,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21066,6 +26330,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21088,6 +26358,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21110,6 +26386,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21131,6 +26413,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21152,7 +26440,13 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21185,6 +26479,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21219,6 +26519,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -21254,6 +26560,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -21285,6 +26597,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -21316,6 +26634,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21336,6 +26660,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21354,6 +26684,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21373,6 +26709,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21392,6 +26734,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21413,6 +26761,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21434,6 +26788,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21457,6 +26817,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21480,7 +26846,13 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21513,6 +26885,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21549,6 +26927,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -21587,6 +26971,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -21621,6 +27011,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -21653,6 +27049,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21675,6 +27077,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21694,6 +27102,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21715,6 +27129,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21735,6 +27155,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21759,6 +27185,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21783,6 +27215,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21808,6 +27246,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21833,7 +27277,13 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21868,6 +27318,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21904,6 +27360,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -21942,6 +27404,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -21976,6 +27444,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -22008,6 +27482,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22030,6 +27510,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22049,6 +27535,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22070,6 +27562,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22090,6 +27588,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22114,6 +27618,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22138,6 +27648,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22163,6 +27679,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22188,7 +27710,13 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -22223,6 +27751,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -22258,6 +27792,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -22294,6 +27834,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -22326,6 +27872,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -22357,6 +27909,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22378,6 +27936,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22396,6 +27960,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22415,6 +27985,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22433,6 +28009,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22455,6 +28037,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22477,6 +28065,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22498,6 +28092,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22519,7 +28119,13 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -22552,6 +28158,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -22587,6 +28199,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -22623,6 +28241,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -22655,6 +28279,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -22686,6 +28316,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22707,6 +28343,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22725,6 +28367,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22744,6 +28392,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22762,6 +28416,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22784,6 +28444,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22806,6 +28472,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22827,6 +28499,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22848,7 +28526,13 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -22881,6 +28565,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -22917,6 +28607,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -22955,6 +28651,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -22989,6 +28691,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -23021,6 +28729,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23043,6 +28757,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23062,6 +28782,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23083,6 +28809,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23103,6 +28835,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -23127,6 +28865,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -23151,6 +28895,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -23176,6 +28926,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -23201,7 +28957,13 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -23236,6 +28998,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -23272,6 +29040,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -23310,6 +29084,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -23344,6 +29124,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -23376,6 +29162,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23398,6 +29190,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23417,6 +29215,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23438,6 +29242,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23458,6 +29268,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -23482,6 +29298,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -23506,6 +29328,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -23531,6 +29359,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -23556,7 +29390,13 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -23591,6 +29431,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -23627,6 +29473,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -23665,6 +29517,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -23699,6 +29557,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -23731,6 +29595,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23753,6 +29623,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23772,6 +29648,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23793,6 +29675,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23813,6 +29701,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -23837,6 +29731,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -23861,6 +29761,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -23886,6 +29792,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -23911,7 +29823,13 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -23946,6 +29864,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -23982,6 +29906,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -24020,6 +29950,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -24054,6 +29990,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -24086,6 +30028,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24108,6 +30056,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24127,6 +30081,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24148,6 +30108,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24168,6 +30134,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -24192,6 +30164,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -24216,6 +30194,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -24241,6 +30225,12 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -24266,7 +30256,13 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -24301,6 +30297,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -24337,6 +30339,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -24375,6 +30383,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -24409,6 +30423,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -24441,6 +30461,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24463,6 +30489,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24482,6 +30514,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24503,6 +30541,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24523,6 +30567,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -24547,6 +30597,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -24571,6 +30627,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -24596,6 +30658,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -24621,7 +30689,13 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -24656,6 +30730,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -24692,6 +30772,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -24730,6 +30816,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -24764,6 +30856,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -24796,6 +30894,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24818,6 +30922,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24837,6 +30947,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24858,6 +30974,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24878,6 +31000,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -24902,6 +31030,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -24926,6 +31060,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -24951,6 +31091,12 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -24976,7 +31122,13 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -25011,6 +31163,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -25047,6 +31205,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -25085,6 +31249,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -25119,6 +31289,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -25151,6 +31327,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25173,6 +31355,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25192,6 +31380,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25213,6 +31407,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25233,6 +31433,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -25257,6 +31463,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -25281,6 +31493,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -25306,6 +31524,12 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -25331,7 +31555,13 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -25366,6 +31596,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -25402,6 +31638,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -25440,6 +31682,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -25474,6 +31722,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -25506,6 +31760,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25528,6 +31788,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25547,6 +31813,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25568,6 +31840,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25588,6 +31866,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -25612,6 +31896,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -25636,6 +31926,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -25661,6 +31957,12 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -25686,7 +31988,13 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll index 5815d22f958f..2bf272a74d0c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll @@ -6,12 +6,16 @@ define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) { ; GFX12-LABEL: flat_last_use_load_0: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_LU +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -21,12 +25,16 @@ define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) { ; GFX1250-LABEL: flat_last_use_load_0: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_LU ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm entry: @@ -39,17 +47,20 @@ define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) { ; GFX12-LABEL: flat_last_use_load_1: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s2, 0x3ff ; GFX12-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX12-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX12-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_mov_b32 s2, 2 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX12-NEXT: v_mov_b32_e32 v2, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshlrev_b64_e64 v[1:2], s2, v[0:1] ; GFX12-NEXT: s_mov_b32 s3, s4 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-NEXT: s_mov_b32 s2, s5 @@ -71,15 +82,19 @@ define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) { ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b32 s4, 0x3ff ; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_LU ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm entry: @@ -93,15 +108,19 @@ entry: define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) { ; GFX12-LABEL: flat_last_use_and_volatile_load: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -111,13 +130,17 @@ define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) { ; GFX1250-LABEL: flat_last_use_and_volatile_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm entry: @@ -129,12 +152,16 @@ entry: define amdgpu_kernel void @flat_last_use_and_nontemporal_load(ptr %in, ptr %out) { ; GFX12-LABEL: flat_last_use_and_nontemporal_load: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_LU +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -144,12 +171,16 @@ define amdgpu_kernel void @flat_last_use_and_nontemporal_load(ptr %in, ptr %out) ; GFX1250-LABEL: flat_last_use_and_nontemporal_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_LU ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index d0bdd83686a6..c7530c59d0f3 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -19,15 +19,19 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc slc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -37,15 +41,19 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] slc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -55,29 +63,37 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] slc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc slc +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -85,13 +101,17 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc slc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -99,11 +119,15 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc slc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -111,23 +135,31 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX942-NOTTGSPLIT-LABEL: flat_nontemporal_load_0: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_nontemporal_load_0: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -135,40 +167,52 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX11-WGP-LABEL: flat_nontemporal_load_0: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc dlc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_nontemporal_load_0: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc dlc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_nontemporal_load_0: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -177,12 +221,16 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX12-CU-LABEL: flat_nontemporal_load_0: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -192,12 +240,16 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX1250-LABEL: flat_nontemporal_load_0: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -214,14 +266,17 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 -; GFX7-NEXT: s_mov_b32 s6, 2 -; GFX7-NEXT: v_lshlrev_b32_e64 v1, s6, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, 0 -; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX7-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 2 +; GFX7-NEXT: v_lshl_b64 v[1:2], v[0:1], s6 ; GFX7-NEXT: s_mov_b32 s6, s8 ; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_mov_b32 s8, s9 @@ -245,15 +300,18 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_mov_b32 s6, 2 -; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v1, s6, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-WGP-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, 2 +; GFX10-WGP-NEXT: v_lshlrev_b64 v[1:2], s6, v[0:1] ; GFX10-WGP-NEXT: s_mov_b32 s7, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-WGP-NEXT: s_mov_b32 s6, s9 @@ -276,15 +334,18 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 -; GFX10-CU-NEXT: s_mov_b32 s6, 2 -; GFX10-CU-NEXT: v_lshlrev_b32_e64 v1, s6, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX10-CU-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-CU-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, 2 +; GFX10-CU-NEXT: v_lshlrev_b64 v[1:2], s6, v[0:1] ; GFX10-CU-NEXT: s_mov_b32 s7, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-CU-NEXT: s_mov_b32 s6, s9 @@ -303,14 +364,17 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 2 -; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 +; SKIP-CACHE-INV-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 2 +; SKIP-CACHE-INV-NEXT: v_lshl_b64 v[1:2], v[0:1], s2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s5 @@ -332,17 +396,20 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff ; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 2 -; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v2, s6, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b64 v[2:3], s6, v[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, s8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s8, s9 @@ -363,17 +430,20 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff ; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6 +; GFX90A-TGSPLIT-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 2 -; GFX90A-TGSPLIT-NEXT: v_lshlrev_b32_e64 v2, s6, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_lshlrev_b64 v[2:3], s6, v[0:1] ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, s8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s8, s9 @@ -391,17 +461,22 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX942-NOTTGSPLIT-LABEL: flat_nontemporal_load_1: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s4, 0x3ff -; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s4 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s4, 2 -; GFX942-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s4, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff +; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_ashrrev_i32_e64 v2, 31, v0 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u64 v[0:1], v[0:1], s2, v[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -410,17 +485,22 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX942-TGSPLIT-LABEL: flat_nontemporal_load_1: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s4, 0x3ff -; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s4 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s4, 2 -; GFX942-TGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s4, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff +; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX942-TGSPLIT-NEXT: v_ashrrev_i32_e64 v2, 31, v0 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-TGSPLIT-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 2 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: v_lshl_add_u64 v[0:1], v[0:1], s2, v[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -430,16 +510,19 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX11-WGP-LABEL: flat_nontemporal_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff ; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-WGP-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-WGP-NEXT: s_mov_b32 s2, 2 -; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_lshlrev_b64 v[1:2], s2, v[0:1] ; GFX11-WGP-NEXT: s_mov_b32 s3, s4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-WGP-NEXT: s_mov_b32 s2, s5 @@ -458,16 +541,19 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX11-CU-LABEL: flat_nontemporal_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff ; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-CU-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-CU-NEXT: s_mov_b32 s2, 2 -; GFX11-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX11-CU-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_lshlrev_b64 v[1:2], s2, v[0:1] ; GFX11-CU-NEXT: s_mov_b32 s3, s4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-CU-NEXT: s_mov_b32 s2, s5 @@ -486,17 +572,20 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-WGP-LABEL: flat_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX12-WGP-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 ; GFX12-WGP-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_lshlrev_b64_e64 v[1:2], s2, v[0:1] ; GFX12-WGP-NEXT: s_mov_b32 s3, s4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-WGP-NEXT: s_mov_b32 s2, s5 @@ -517,17 +606,20 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-CU-LABEL: flat_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX12-CU-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX12-CU-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 ; GFX12-CU-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX12-CU-NEXT: v_mov_b32_e32 v2, v0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_lshlrev_b64_e64 v[1:2], s2, v[0:1] ; GFX12-CU-NEXT: s_mov_b32 s3, s4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-CU-NEXT: s_mov_b32 s2, s5 @@ -549,15 +641,19 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b32 s4, 0x3ff ; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_NT ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -575,15 +671,19 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 glc slc ; GFX7-NEXT: s_endpgm ; @@ -593,15 +693,19 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 glc slc ; GFX10-WGP-NEXT: s_endpgm ; @@ -611,29 +715,37 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 glc slc ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 glc slc ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -641,13 +753,17 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 glc slc ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -655,11 +771,15 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 glc slc @@ -667,23 +787,31 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX942-NOTTGSPLIT-LABEL: flat_nontemporal_store_0: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_nontemporal_store_0: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt @@ -691,40 +819,52 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX11-WGP-LABEL: flat_nontemporal_store_0: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_nontemporal_store_0: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_nontemporal_store_0: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -733,12 +873,16 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX12-CU-LABEL: flat_nontemporal_store_0: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -748,12 +892,16 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX1250-LABEL: flat_nontemporal_store_0: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] th:TH_STORE_NT ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -770,16 +918,20 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: flat_load_dword v2, v[1:2] +; GFX7-NEXT: v_ashrrev_i32_e64 v3, 31, v0 +; GFX7-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_mov_b32 s4, 2 -; GFX7-NEXT: v_lshlrev_b32_e64 v3, s4, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, 0 -; GFX7-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_lshl_b64 v[3:4], v[0:1], s4 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_mov_b32 s6, s7 @@ -800,16 +952,20 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-WGP-NEXT: flat_load_dword v2, v[1:2] +; GFX10-WGP-NEXT: v_ashrrev_i32_e64 v3, 31, v0 +; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3 ; GFX10-WGP-NEXT: s_mov_b32 s4, 2 -; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v3, s4, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GFX10-WGP-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-WGP-NEXT: v_lshlrev_b64 v[3:4], s4, v[0:1] ; GFX10-WGP-NEXT: s_mov_b32 s5, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-WGP-NEXT: s_mov_b32 s4, s7 @@ -829,16 +985,20 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-CU-NEXT: flat_load_dword v2, v[1:2] +; GFX10-CU-NEXT: v_ashrrev_i32_e64 v3, 31, v0 +; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3 ; GFX10-CU-NEXT: s_mov_b32 s4, 2 -; GFX10-CU-NEXT: v_lshlrev_b32_e64 v3, s4, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GFX10-CU-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-CU-NEXT: v_lshlrev_b64 v[3:4], s4, v[0:1] ; GFX10-CU-NEXT: s_mov_b32 s5, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-CU-NEXT: s_mov_b32 s4, s7 @@ -854,16 +1014,20 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[1:2] +; SKIP-CACHE-INV-NEXT: v_ashrrev_i32_e64 v3, 31, v0 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v3 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, 2 -; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v4, v0 +; SKIP-CACHE-INV-NEXT: v_lshl_b64 v[3:4], v[0:1], s0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, v3 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s3 @@ -882,17 +1046,21 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s4, 0x3ff ; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_ashrrev_i32_e64 v3, 31, v0 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s4, 2 -; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v4, s4, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b64 v[4:5], s4, v[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s4, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, s7 @@ -911,17 +1079,21 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[2:3] ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s4, 0x3ff ; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s4 +; GFX90A-TGSPLIT-NEXT: v_ashrrev_i32_e64 v3, 31, v0 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s4, 2 -; GFX90A-TGSPLIT-NEXT: v_lshlrev_b32_e64 v4, s4, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-TGSPLIT-NEXT: v_lshlrev_b64 v[4:5], s4, v[0:1] ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s4, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, s7 @@ -937,38 +1109,46 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX942-NOTTGSPLIT-LABEL: flat_nontemporal_store_1: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff -; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 2 -; GFX942-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s2, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s0, 0x3ff +; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_ashrrev_i32_e64 v3, 31, v0 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s0, 2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u64 v[0:1], v[0:1], s0, v[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_nontemporal_store_1: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[2:3] -; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff -; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 2 -; GFX942-TGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s2, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s0, 0x3ff +; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s0 +; GFX942-TGSPLIT-NEXT: v_ashrrev_i32_e64 v3, 31, v0 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-TGSPLIT-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GFX942-TGSPLIT-NEXT: s_mov_b32 s0, 2 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX942-TGSPLIT-NEXT: v_lshl_add_u64 v[0:1], v[0:1], s0, v[4:5] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -976,6 +1156,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX11-WGP-LABEL: flat_nontemporal_store_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 @@ -983,11 +1167,11 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2] ; GFX11-WGP-NEXT: s_mov_b32 s0, 0x3ff ; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s0 +; GFX11-WGP-NEXT: v_ashrrev_i32_e64 v3, 31, v0 +; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3 ; GFX11-WGP-NEXT: s_mov_b32 s0, 2 -; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GFX11-WGP-NEXT: v_mov_b32_e32 v4, v0 +; GFX11-WGP-NEXT: v_lshlrev_b64 v[3:4], s0, v[0:1] ; GFX11-WGP-NEXT: s_mov_b32 s1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-WGP-NEXT: s_mov_b32 s0, s3 @@ -1003,6 +1187,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX11-CU-LABEL: flat_nontemporal_store_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 @@ -1010,11 +1198,11 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2] ; GFX11-CU-NEXT: s_mov_b32 s0, 0x3ff ; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s0 +; GFX11-CU-NEXT: v_ashrrev_i32_e64 v3, 31, v0 +; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3 ; GFX11-CU-NEXT: s_mov_b32 s0, 2 -; GFX11-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GFX11-CU-NEXT: v_mov_b32_e32 v4, v0 +; GFX11-CU-NEXT: v_lshlrev_b64 v[3:4], s0, v[0:1] ; GFX11-CU-NEXT: s_mov_b32 s1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-CU-NEXT: s_mov_b32 s0, s3 @@ -1030,6 +1218,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-WGP-LABEL: flat_nontemporal_store_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 @@ -1038,12 +1230,12 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-WGP-NEXT: s_mov_b32 s0, 0x3ff ; GFX12-WGP-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s0 +; GFX12-WGP-NEXT: v_ashrrev_i32_e64 v3, 31, v0 +; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3 ; GFX12-WGP-NEXT: s_mov_b32 s0, 2 ; GFX12-WGP-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v0 +; GFX12-WGP-NEXT: v_lshlrev_b64_e64 v[3:4], s0, v[0:1] ; GFX12-WGP-NEXT: s_mov_b32 s1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-WGP-NEXT: s_mov_b32 s0, s3 @@ -1061,6 +1253,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-CU-LABEL: flat_nontemporal_store_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 @@ -1069,12 +1265,12 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-CU-NEXT: s_mov_b32 s0, 0x3ff ; GFX12-CU-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s0 +; GFX12-CU-NEXT: v_ashrrev_i32_e64 v3, 31, v0 +; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3 ; GFX12-CU-NEXT: s_mov_b32 s0, 2 ; GFX12-CU-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GFX12-CU-NEXT: v_mov_b32_e32 v4, v0 +; GFX12-CU-NEXT: v_lshlrev_b64_e64 v[3:4], s0, v[0:1] ; GFX12-CU-NEXT: s_mov_b32 s1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-CU-NEXT: s_mov_b32 s0, s3 @@ -1092,15 +1288,19 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX1250-LABEL: flat_nontemporal_store_1: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b32 s2, 0x3ff ; GFX1250-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset th:TH_STORE_NT ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -1118,16 +1318,18 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1137,16 +1339,18 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1156,31 +1360,35 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_nontemporal_volatile_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1188,14 +1396,16 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1203,82 +1413,98 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_nontemporal_volatile_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_nontemporal_volatile_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_nontemporal_volatile_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_nontemporal_volatile_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -1287,15 +1513,19 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX12-CU-LABEL: flat_nontemporal_volatile_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -1305,13 +1535,17 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; GFX1250-LABEL: flat_nontemporal_volatile_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll index 3b97aa0afce1..d1373a108714 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -19,15 +19,19 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -37,15 +41,19 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -55,29 +63,37 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -85,13 +101,17 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -99,11 +119,15 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -111,23 +135,31 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -135,40 +167,52 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX11-WGP-LABEL: flat_singlethread_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -177,12 +221,16 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX12-CU-LABEL: flat_singlethread_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -192,12 +240,16 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX1250-LABEL: flat_singlethread_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -213,15 +265,19 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -231,15 +287,19 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -249,29 +309,37 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -279,13 +347,17 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -293,11 +365,15 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -305,23 +381,31 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -329,40 +413,52 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX11-WGP-LABEL: flat_singlethread_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -371,12 +467,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX12-CU-LABEL: flat_singlethread_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -386,12 +486,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; GFX1250-LABEL: flat_singlethread_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -407,15 +511,19 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -425,15 +533,19 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -443,29 +555,37 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -473,13 +593,17 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -487,11 +611,15 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -499,23 +627,31 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -523,40 +659,52 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX11-WGP-LABEL: flat_singlethread_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -565,12 +713,16 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX12-CU-LABEL: flat_singlethread_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -580,12 +732,16 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX1250-LABEL: flat_singlethread_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -601,15 +757,19 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -619,15 +779,19 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -637,29 +801,37 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -667,13 +839,17 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -681,11 +857,15 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -693,23 +873,31 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -717,40 +905,52 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -759,12 +959,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX12-CU-LABEL: flat_singlethread_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -774,12 +978,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX1250-LABEL: flat_singlethread_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -796,6 +1004,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -811,6 +1023,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -826,6 +1042,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -837,6 +1057,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; SKIP-CACHE-INV-LABEL: flat_singlethread_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -850,6 +1074,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -862,6 +1090,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -872,6 +1104,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -882,6 +1118,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX942-TGSPLIT-LABEL: flat_singlethread_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -892,6 +1132,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX11-WGP-LABEL: flat_singlethread_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -903,6 +1147,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX11-CU-LABEL: flat_singlethread_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -914,6 +1162,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX12-WGP-LABEL: flat_singlethread_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -925,6 +1177,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX12-CU-LABEL: flat_singlethread_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -936,12 +1192,16 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX1250-LABEL: flat_singlethread_unordered_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -957,6 +1217,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -972,6 +1236,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -987,6 +1255,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -998,6 +1270,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -1011,6 +1287,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1023,6 +1303,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1033,6 +1317,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1043,6 +1331,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX942-TGSPLIT-LABEL: flat_singlethread_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1053,6 +1345,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX11-WGP-LABEL: flat_singlethread_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1064,6 +1360,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX11-CU-LABEL: flat_singlethread_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1075,6 +1375,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX12-WGP-LABEL: flat_singlethread_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1086,6 +1390,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX12-CU-LABEL: flat_singlethread_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1097,12 +1405,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX1250-LABEL: flat_singlethread_monotonic_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1118,6 +1430,10 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1133,6 +1449,10 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -1148,6 +1468,10 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -1159,6 +1483,10 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -1172,6 +1500,10 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1184,6 +1516,10 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1194,6 +1530,10 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1204,6 +1544,10 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX942-TGSPLIT-LABEL: flat_singlethread_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1214,6 +1558,10 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX11-WGP-LABEL: flat_singlethread_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1225,6 +1573,10 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX11-CU-LABEL: flat_singlethread_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1236,6 +1588,10 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX12-WGP-LABEL: flat_singlethread_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1247,6 +1603,10 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX12-CU-LABEL: flat_singlethread_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1258,12 +1618,16 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX1250-LABEL: flat_singlethread_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1279,6 +1643,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1294,6 +1662,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -1309,6 +1681,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -1320,6 +1696,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -1333,6 +1713,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1345,6 +1729,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1355,6 +1743,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1365,6 +1757,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX942-TGSPLIT-LABEL: flat_singlethread_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1375,6 +1771,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1386,6 +1786,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX11-CU-LABEL: flat_singlethread_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1397,6 +1801,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1408,6 +1816,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX12-CU-LABEL: flat_singlethread_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1419,12 +1831,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX1250-LABEL: flat_singlethread_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1439,11 +1855,15 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -1454,11 +1874,15 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -1469,22 +1893,30 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1493,10 +1925,14 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1505,74 +1941,102 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm @@ -1580,7 +2044,11 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; GFX1250-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1600,11 +2068,15 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -1615,11 +2087,15 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -1630,22 +2106,30 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1654,10 +2138,14 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1666,74 +2154,102 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm @@ -1741,7 +2257,11 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX1250-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1761,11 +2281,15 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -1776,11 +2300,15 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -1791,22 +2319,30 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1815,10 +2351,14 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1827,74 +2367,102 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_singlethread_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm @@ -1902,7 +2470,11 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX1250-LABEL: flat_singlethread_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1922,11 +2494,15 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -1937,11 +2513,15 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -1952,22 +2532,30 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1976,10 +2564,14 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1988,74 +2580,102 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm @@ -2063,7 +2683,11 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX1250-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2083,11 +2707,15 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -2098,11 +2726,15 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -2113,22 +2745,30 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -2137,10 +2777,14 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2149,74 +2793,102 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm @@ -2224,7 +2896,11 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX1250-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2245,6 +2921,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2264,6 +2944,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2283,6 +2967,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2298,6 +2986,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2315,6 +3007,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2330,6 +3026,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2343,6 +3043,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2356,6 +3060,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2369,6 +3077,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2384,6 +3096,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2399,6 +3115,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2414,6 +3134,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2429,7 +3153,11 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX1250-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2453,6 +3181,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2472,6 +3204,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2491,6 +3227,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2506,6 +3246,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2523,6 +3267,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2538,6 +3286,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2551,6 +3303,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2564,6 +3320,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2577,6 +3337,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2592,6 +3356,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2607,6 +3375,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2622,6 +3394,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2637,7 +3413,11 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX1250-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2661,6 +3441,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2680,6 +3464,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2699,6 +3487,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2714,6 +3506,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2731,6 +3527,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2746,6 +3546,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2759,6 +3563,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2772,6 +3580,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2785,6 +3597,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2800,6 +3616,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2815,6 +3635,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2830,6 +3654,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2845,7 +3673,11 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX1250-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2869,11 +3701,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -2883,6 +3720,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -2898,11 +3736,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -2912,6 +3755,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -2927,11 +3771,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -2941,6 +3790,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -2952,11 +3802,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -2966,6 +3821,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -2979,6 +3835,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2995,6 +3857,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3009,6 +3877,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3023,6 +3897,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3037,6 +3917,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3052,6 +3938,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3067,6 +3959,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3082,6 +3980,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3097,7 +4001,13 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3123,11 +4033,16 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3137,6 +4052,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3152,11 +4068,16 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -3166,6 +4087,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -3181,11 +4103,16 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -3195,6 +4122,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -3206,11 +4134,16 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -3220,6 +4153,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -3233,6 +4167,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3249,6 +4189,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3263,6 +4209,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3277,6 +4229,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3291,6 +4249,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3306,6 +4270,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3321,6 +4291,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3336,6 +4312,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3351,7 +4333,13 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3377,11 +4365,16 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3391,6 +4384,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3406,11 +4400,16 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -3420,6 +4419,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -3435,11 +4435,16 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -3449,6 +4454,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -3460,11 +4466,16 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -3474,6 +4485,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -3487,6 +4499,12 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3503,6 +4521,12 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3517,6 +4541,12 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3531,6 +4561,12 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3545,6 +4581,12 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3560,6 +4602,12 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3575,6 +4623,12 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3590,6 +4644,12 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3605,7 +4665,13 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX1250-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3631,11 +4697,16 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3645,6 +4716,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3660,11 +4732,16 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -3674,6 +4751,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -3689,11 +4767,16 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -3703,6 +4786,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -3714,11 +4798,16 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -3728,6 +4817,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -3741,6 +4831,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3757,6 +4853,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3771,6 +4873,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3785,6 +4893,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3799,6 +4913,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3814,6 +4934,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3829,6 +4955,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3844,6 +4976,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3859,7 +4997,13 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3885,11 +5029,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3899,6 +5048,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3914,11 +5064,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -3928,6 +5083,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -3943,11 +5099,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -3957,6 +5118,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -3968,11 +5130,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -3982,6 +5149,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -3995,6 +5163,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4011,6 +5185,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4025,6 +5205,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4039,6 +5225,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4053,6 +5245,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4068,6 +5266,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4083,6 +5287,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4098,6 +5308,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4113,7 +5329,13 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4139,11 +5361,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4153,6 +5380,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4168,11 +5396,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -4182,6 +5415,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -4197,11 +5431,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -4211,6 +5450,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -4222,11 +5462,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -4236,6 +5481,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -4249,6 +5495,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4265,6 +5517,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4279,6 +5537,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4293,6 +5557,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4307,6 +5577,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4322,6 +5598,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4337,6 +5619,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4352,6 +5640,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4367,7 +5661,13 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4393,11 +5693,16 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4407,6 +5712,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4422,11 +5728,16 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -4436,6 +5747,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -4451,11 +5763,16 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -4465,6 +5782,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -4476,11 +5794,16 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -4490,6 +5813,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -4503,6 +5827,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4519,6 +5849,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4533,6 +5869,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4547,6 +5889,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4561,6 +5909,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4576,6 +5930,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4591,6 +5951,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4606,6 +5972,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4621,7 +5993,13 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX1250-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4647,11 +6025,16 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4661,6 +6044,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4676,11 +6060,16 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -4690,6 +6079,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -4705,11 +6095,16 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -4719,6 +6114,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -4730,11 +6126,16 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -4744,6 +6145,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -4757,6 +6159,12 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4773,6 +6181,12 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4787,6 +6201,12 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4801,6 +6221,12 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4815,6 +6241,12 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4830,6 +6262,12 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4845,6 +6283,12 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4860,6 +6304,12 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4875,7 +6325,13 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX1250-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4901,11 +6357,16 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4915,6 +6376,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4930,11 +6392,16 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -4944,6 +6411,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -4959,11 +6427,16 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -4973,6 +6446,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -4984,11 +6458,16 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -4998,6 +6477,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -5011,6 +6491,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5027,6 +6513,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5041,6 +6533,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5055,6 +6553,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5069,6 +6573,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5084,6 +6594,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5099,6 +6615,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5114,6 +6636,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5129,7 +6657,13 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5155,11 +6689,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5169,6 +6708,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5184,11 +6724,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -5198,6 +6743,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -5213,11 +6759,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -5227,6 +6778,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -5238,11 +6790,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -5252,6 +6809,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -5265,6 +6823,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5281,6 +6845,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5295,6 +6865,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5309,6 +6885,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5323,6 +6905,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5338,6 +6926,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5353,6 +6947,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5368,6 +6968,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5383,7 +6989,13 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5409,11 +7021,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5423,6 +7040,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5438,11 +7056,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -5452,6 +7075,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -5467,11 +7091,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -5481,6 +7110,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -5492,11 +7122,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -5506,6 +7141,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -5519,6 +7155,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5535,6 +7177,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5549,6 +7197,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5563,6 +7217,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5577,6 +7237,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5592,6 +7258,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5607,6 +7279,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5622,6 +7300,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5637,7 +7321,13 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5663,11 +7353,16 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5677,6 +7372,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5692,11 +7388,16 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -5706,6 +7407,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -5721,11 +7423,16 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -5735,6 +7442,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -5746,11 +7454,16 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -5760,6 +7473,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -5773,6 +7487,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5789,6 +7509,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5803,6 +7529,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5817,6 +7549,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5831,6 +7569,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5846,6 +7590,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5861,6 +7611,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5876,6 +7632,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5891,7 +7653,13 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5917,11 +7685,16 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5931,6 +7704,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5946,11 +7720,16 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -5960,6 +7739,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -5975,11 +7755,16 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -5989,6 +7774,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -6000,11 +7786,16 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -6014,6 +7805,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -6027,6 +7819,12 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6043,6 +7841,12 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6057,6 +7861,12 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6071,6 +7881,12 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6085,6 +7901,12 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6100,6 +7922,12 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6115,6 +7943,12 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6130,6 +7964,12 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6145,7 +7985,13 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6171,11 +8017,16 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6185,6 +8036,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6200,11 +8052,16 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -6214,6 +8071,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -6229,11 +8087,16 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -6243,6 +8106,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -6254,11 +8118,16 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -6268,6 +8137,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -6281,6 +8151,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6297,6 +8173,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6311,6 +8193,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6325,6 +8213,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6339,6 +8233,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6354,6 +8254,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6369,6 +8275,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6384,6 +8296,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6399,7 +8317,13 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6425,11 +8349,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6439,6 +8368,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6454,11 +8384,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -6468,6 +8403,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -6483,11 +8419,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -6497,6 +8438,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -6508,11 +8450,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -6522,6 +8469,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -6535,6 +8483,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6551,6 +8505,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6565,6 +8525,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6579,6 +8545,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6593,6 +8565,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6608,6 +8586,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6623,6 +8607,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6638,6 +8628,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6653,7 +8649,13 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6680,6 +8682,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -6713,6 +8721,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -6746,6 +8760,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -6775,6 +8795,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -6805,6 +8831,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6824,6 +8856,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6841,6 +8879,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6858,6 +8902,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6875,6 +8925,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6894,6 +8950,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6913,6 +8975,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6932,6 +9000,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6951,7 +9025,13 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6982,6 +9062,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7015,6 +9101,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -7048,6 +9140,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -7077,6 +9175,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -7107,6 +9211,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7126,6 +9236,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7143,6 +9259,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7160,6 +9282,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7177,6 +9305,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7196,6 +9330,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7215,6 +9355,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7234,6 +9380,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7253,7 +9405,13 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7284,6 +9442,12 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7317,6 +9481,12 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -7350,6 +9520,12 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -7379,6 +9555,12 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -7409,6 +9591,12 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7428,6 +9616,12 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7445,6 +9639,12 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7462,6 +9662,12 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7479,6 +9685,12 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7498,6 +9710,12 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7517,6 +9735,12 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7536,6 +9760,12 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7555,7 +9785,13 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7586,6 +9822,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7619,6 +9861,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -7652,6 +9900,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -7681,6 +9935,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -7711,6 +9971,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7730,6 +9996,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7747,6 +10019,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7764,6 +10042,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7781,6 +10065,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7800,6 +10090,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7819,6 +10115,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7838,6 +10140,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7857,7 +10165,13 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7888,6 +10202,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7921,6 +10241,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -7954,6 +10280,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -7983,6 +10315,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -8013,6 +10351,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8032,6 +10376,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8049,6 +10399,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8066,6 +10422,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8083,6 +10445,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8102,6 +10470,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8121,6 +10495,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8140,6 +10520,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8159,7 +10545,13 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8190,6 +10582,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8223,6 +10621,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -8256,6 +10660,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -8285,6 +10695,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -8315,6 +10731,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8334,6 +10756,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8351,6 +10779,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8368,6 +10802,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8385,6 +10825,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8404,6 +10850,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8423,6 +10875,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8442,6 +10900,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8461,7 +10925,13 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8492,6 +10962,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8525,6 +11001,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -8558,6 +11040,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -8587,6 +11075,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -8617,6 +11111,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8636,6 +11136,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8653,6 +11159,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8670,6 +11182,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8687,6 +11205,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8706,6 +11230,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8725,6 +11255,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8744,6 +11280,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8763,7 +11305,13 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8794,6 +11342,12 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8827,6 +11381,12 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -8860,6 +11420,12 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -8889,6 +11455,12 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -8919,6 +11491,12 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8938,6 +11516,12 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8955,6 +11539,12 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8972,6 +11562,12 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8989,6 +11585,12 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9008,6 +11610,12 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9027,6 +11635,12 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9046,6 +11660,12 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9065,7 +11685,13 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9096,6 +11722,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9129,6 +11761,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -9162,6 +11800,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -9191,6 +11835,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -9221,6 +11871,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9240,6 +11896,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9257,6 +11919,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9274,6 +11942,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9291,6 +11965,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9310,6 +11990,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9329,6 +12015,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9348,6 +12040,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9367,7 +12065,13 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9398,6 +12102,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9431,6 +12141,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -9464,6 +12180,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -9493,6 +12215,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -9523,6 +12251,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9542,6 +12276,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9559,6 +12299,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9576,6 +12322,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9593,6 +12345,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9612,6 +12370,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9631,6 +12395,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9650,6 +12420,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9669,7 +12445,13 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9700,6 +12482,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9733,6 +12521,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -9766,6 +12560,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -9795,6 +12595,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -9825,6 +12631,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9844,6 +12656,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9861,6 +12679,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9878,6 +12702,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9895,6 +12725,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9914,6 +12750,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9933,6 +12775,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9952,6 +12800,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9971,7 +12825,13 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10002,6 +12862,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10035,6 +12901,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -10068,6 +12940,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -10097,6 +12975,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -10127,6 +13011,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10146,6 +13036,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10163,6 +13059,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10180,6 +13082,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10197,6 +13105,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10216,6 +13130,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10235,6 +13155,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10254,6 +13180,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10273,7 +13205,13 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10304,6 +13242,12 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10337,6 +13281,12 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -10370,6 +13320,12 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -10399,6 +13355,12 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -10429,6 +13391,12 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10448,6 +13416,12 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10465,6 +13439,12 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10482,6 +13462,12 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10499,6 +13485,12 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10518,6 +13510,12 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10537,6 +13535,12 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10556,6 +13560,12 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10575,7 +13585,13 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10606,6 +13622,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10639,6 +13661,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -10672,6 +13700,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -10701,6 +13735,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -10731,6 +13771,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10750,6 +13796,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10767,6 +13819,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10784,6 +13842,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10801,6 +13865,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10820,6 +13890,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10839,6 +13915,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10858,6 +13940,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10877,7 +13965,13 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10908,6 +14002,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10941,6 +14041,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -10974,6 +14080,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -11003,6 +14115,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -11033,6 +14151,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11052,6 +14176,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11069,6 +14199,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11086,6 +14222,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11103,6 +14245,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11122,6 +14270,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11141,6 +14295,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11160,6 +14320,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11179,7 +14345,13 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -11208,15 +14380,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11226,15 +14402,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -11244,29 +14424,37 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11274,13 +14462,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11288,11 +14480,15 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11300,23 +14496,31 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11324,40 +14528,52 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11366,12 +14582,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11381,12 +14601,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; GFX1250-LABEL: flat_singlethread_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -11402,15 +14626,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11420,15 +14648,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -11438,29 +14670,37 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11468,13 +14708,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11482,11 +14726,15 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11494,23 +14742,31 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11518,40 +14774,52 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11560,12 +14828,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11575,12 +14847,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -11596,15 +14872,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11614,15 +14894,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -11632,29 +14916,37 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11662,13 +14954,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11676,11 +14972,15 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11688,23 +14988,31 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11712,40 +15020,52 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11754,12 +15074,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11769,12 +15093,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX1250-LABEL: flat_singlethread_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -11790,15 +15118,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11808,15 +15140,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -11826,29 +15162,37 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11856,13 +15200,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11870,11 +15218,15 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11882,23 +15234,31 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11906,40 +15266,52 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11948,12 +15320,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11963,12 +15339,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -11985,6 +15365,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -12000,6 +15384,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -12015,6 +15403,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -12026,6 +15418,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -12039,6 +15435,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -12051,6 +15451,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -12061,6 +15465,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -12071,6 +15479,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -12081,6 +15493,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX11-WGP-LABEL: flat_singlethread_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -12092,6 +15508,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX11-CU-LABEL: flat_singlethread_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -12103,6 +15523,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX12-WGP-LABEL: flat_singlethread_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -12114,6 +15538,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX12-CU-LABEL: flat_singlethread_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -12125,12 +15553,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX1250-LABEL: flat_singlethread_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -12146,6 +15578,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -12161,6 +15597,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -12176,6 +15616,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -12187,6 +15631,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -12200,6 +15648,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -12212,6 +15664,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -12222,6 +15678,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -12232,6 +15692,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -12242,6 +15706,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -12253,6 +15721,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -12264,6 +15736,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -12275,6 +15751,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -12286,12 +15766,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -12307,6 +15791,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -12322,6 +15810,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -12337,6 +15829,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -12348,6 +15844,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -12361,6 +15861,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -12373,6 +15877,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -12383,6 +15891,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -12393,6 +15905,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -12403,6 +15919,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -12414,6 +15934,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX11-CU-LABEL: flat_singlethread_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -12425,6 +15949,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -12436,6 +15964,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX12-CU-LABEL: flat_singlethread_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -12447,12 +15979,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX1250-LABEL: flat_singlethread_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -12468,6 +16004,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -12483,6 +16023,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -12498,6 +16042,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -12509,6 +16057,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -12522,6 +16074,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -12534,6 +16090,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -12544,6 +16104,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -12554,6 +16118,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -12564,6 +16132,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -12575,6 +16147,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -12586,6 +16162,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -12597,6 +16177,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -12608,12 +16192,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -12628,11 +16216,15 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -12643,11 +16235,15 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -12658,22 +16254,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -12682,10 +16286,14 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -12694,74 +16302,102 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm @@ -12769,7 +16405,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12789,11 +16429,15 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -12804,11 +16448,15 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -12819,22 +16467,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -12843,10 +16499,14 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -12855,74 +16515,102 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm @@ -12930,7 +16618,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX1250-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12950,11 +16642,15 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -12965,11 +16661,15 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -12980,22 +16680,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -13004,10 +16712,14 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -13016,74 +16728,102 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm @@ -13091,7 +16831,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX1250-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13111,11 +16855,15 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -13126,11 +16874,15 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -13141,22 +16893,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -13165,10 +16925,14 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -13177,74 +16941,102 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm @@ -13252,7 +17044,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13272,11 +17068,15 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -13287,11 +17087,15 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -13302,22 +17106,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -13326,10 +17138,14 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -13338,74 +17154,102 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm @@ -13413,7 +17257,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13434,6 +17282,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13453,6 +17305,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -13472,6 +17328,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -13487,6 +17347,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -13504,6 +17368,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -13519,6 +17387,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -13532,6 +17404,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -13545,6 +17421,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -13558,6 +17438,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -13573,6 +17457,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -13588,6 +17476,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -13603,6 +17495,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -13618,7 +17514,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX1250-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13642,6 +17542,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13661,6 +17565,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -13680,6 +17588,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -13695,6 +17607,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -13712,6 +17628,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -13727,6 +17647,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -13740,6 +17664,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -13753,6 +17681,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -13766,6 +17698,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -13781,6 +17717,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -13796,6 +17736,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -13811,6 +17755,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -13826,7 +17774,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13850,6 +17802,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13869,6 +17825,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -13888,6 +17848,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -13903,6 +17867,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -13920,6 +17888,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -13935,6 +17907,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -13948,6 +17924,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -13961,6 +17941,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -13974,6 +17958,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -13989,6 +17977,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -14004,6 +17996,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -14019,6 +18015,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -14034,7 +18034,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14058,11 +18062,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14072,6 +18081,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -14087,11 +18097,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -14101,6 +18116,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -14116,11 +18132,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -14130,6 +18151,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -14141,11 +18163,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -14155,6 +18182,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -14168,6 +18196,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14184,6 +18218,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14198,6 +18238,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14212,6 +18258,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14226,6 +18278,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14241,6 +18299,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14256,6 +18320,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14271,6 +18341,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14286,7 +18362,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -14312,11 +18394,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14326,6 +18413,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -14341,11 +18429,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -14355,6 +18448,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -14370,11 +18464,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -14384,6 +18483,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -14395,11 +18495,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -14409,6 +18514,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -14422,6 +18528,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14438,6 +18550,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14452,6 +18570,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14466,6 +18590,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14480,6 +18610,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14495,6 +18631,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14510,6 +18652,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14525,6 +18673,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14540,7 +18694,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -14566,11 +18726,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14580,6 +18745,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -14595,11 +18761,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -14609,6 +18780,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -14624,11 +18796,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -14638,6 +18815,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -14649,11 +18827,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -14663,6 +18846,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -14676,6 +18860,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14692,6 +18882,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14706,6 +18902,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14720,6 +18922,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14734,6 +18942,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14749,6 +18963,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14764,6 +18984,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14779,6 +19005,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14794,7 +19026,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX1250-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -14820,11 +19058,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14834,6 +19077,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -14849,11 +19093,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -14863,6 +19112,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -14878,11 +19128,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -14892,6 +19147,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -14903,11 +19159,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -14917,6 +19178,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -14930,6 +19192,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14946,6 +19214,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14960,6 +19234,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14974,6 +19254,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14988,6 +19274,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15003,6 +19295,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15018,6 +19316,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15033,6 +19337,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15048,7 +19358,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15074,11 +19390,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15088,6 +19409,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15103,11 +19425,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -15117,6 +19444,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -15132,11 +19460,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -15146,6 +19479,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -15157,11 +19491,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -15171,6 +19510,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -15184,6 +19524,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15200,6 +19546,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15214,6 +19566,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15228,6 +19586,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15242,6 +19606,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15257,6 +19627,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15272,6 +19648,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15287,6 +19669,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15302,7 +19690,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15328,11 +19722,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15342,6 +19741,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15357,11 +19757,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -15371,6 +19776,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -15386,11 +19792,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -15400,6 +19811,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -15411,11 +19823,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -15425,6 +19842,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -15438,6 +19856,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15454,6 +19878,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15468,6 +19898,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15482,6 +19918,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15496,6 +19938,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15511,6 +19959,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15526,6 +19980,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15541,6 +20001,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15556,7 +20022,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15582,11 +20054,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15596,6 +20073,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15611,11 +20089,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -15625,6 +20108,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -15640,11 +20124,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -15654,6 +20143,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -15665,11 +20155,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -15679,6 +20174,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -15692,6 +20188,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15708,6 +20210,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15722,6 +20230,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15736,6 +20250,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15750,6 +20270,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15765,6 +20291,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15780,6 +20312,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15795,6 +20333,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15810,7 +20354,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX1250-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15836,11 +20386,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15850,6 +20405,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15865,11 +20421,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -15879,6 +20440,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -15894,11 +20456,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -15908,6 +20475,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -15919,11 +20487,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -15933,6 +20506,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -15946,6 +20520,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15962,6 +20542,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15976,6 +20562,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15990,6 +20582,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16004,6 +20602,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16019,6 +20623,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16034,6 +20644,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16049,6 +20665,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16064,7 +20686,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX1250-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16090,11 +20718,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16104,6 +20737,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16119,11 +20753,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -16133,6 +20772,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -16148,11 +20788,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -16162,6 +20807,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -16173,11 +20819,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -16187,6 +20838,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -16200,6 +20852,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16216,6 +20874,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16230,6 +20894,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16244,6 +20914,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16258,6 +20934,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16273,6 +20955,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16288,6 +20976,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16303,6 +20997,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16318,7 +21018,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16344,11 +21050,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16358,6 +21069,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16373,11 +21085,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -16387,6 +21104,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -16402,11 +21120,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -16416,6 +21139,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -16427,11 +21151,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -16441,6 +21170,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -16454,6 +21184,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16470,6 +21206,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16484,6 +21226,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16498,6 +21246,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16512,6 +21266,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16527,6 +21287,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16542,6 +21308,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16557,6 +21329,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16572,7 +21350,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16598,11 +21382,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16612,6 +21401,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16627,11 +21417,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -16641,6 +21436,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -16656,11 +21452,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -16670,6 +21471,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -16681,11 +21483,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -16695,6 +21502,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -16708,6 +21516,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16724,6 +21538,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16738,6 +21558,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16752,6 +21578,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16766,6 +21598,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16781,6 +21619,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16796,6 +21640,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16811,6 +21661,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16826,7 +21682,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16852,11 +21714,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16866,6 +21733,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16881,11 +21749,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -16895,6 +21768,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -16910,11 +21784,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -16924,6 +21803,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -16935,11 +21815,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -16949,6 +21834,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -16962,6 +21848,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16978,6 +21870,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16992,6 +21890,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17006,6 +21910,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17020,6 +21930,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17035,6 +21951,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17050,6 +21972,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17065,6 +21993,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17080,7 +22014,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17106,11 +22046,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17120,6 +22065,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17135,11 +22081,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -17149,6 +22100,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -17164,11 +22116,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -17178,6 +22135,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -17189,11 +22147,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -17203,6 +22166,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -17216,6 +22180,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17232,6 +22202,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17246,6 +22222,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17260,6 +22242,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17274,6 +22262,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17289,6 +22283,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17304,6 +22304,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17319,6 +22325,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17334,7 +22346,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17360,11 +22378,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17374,6 +22397,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17389,11 +22413,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -17403,6 +22432,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -17418,11 +22448,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -17432,6 +22467,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -17443,11 +22479,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -17457,6 +22498,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -17470,6 +22512,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17486,6 +22534,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17500,6 +22554,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17514,6 +22574,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17528,6 +22594,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17543,6 +22615,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17558,6 +22636,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17573,6 +22657,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17588,7 +22678,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17614,11 +22710,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17628,6 +22729,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17643,11 +22745,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -17657,6 +22764,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -17672,11 +22780,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -17686,6 +22799,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -17697,11 +22811,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -17711,6 +22830,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -17724,6 +22844,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17740,6 +22866,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17754,6 +22886,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17768,6 +22906,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17782,6 +22926,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17797,6 +22947,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17812,6 +22968,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17827,6 +22989,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17842,7 +23010,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17869,6 +23043,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -17902,6 +23082,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -17935,6 +23121,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -17964,6 +23156,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -17994,6 +23192,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18013,6 +23217,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18030,6 +23240,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18047,6 +23263,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18064,6 +23286,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18083,6 +23311,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18102,6 +23336,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18121,6 +23361,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18140,7 +23386,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18171,6 +23423,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -18204,6 +23462,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -18237,6 +23501,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -18266,6 +23536,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -18296,6 +23572,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18315,6 +23597,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18332,6 +23620,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18349,6 +23643,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18366,6 +23666,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18385,6 +23691,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18404,6 +23716,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18423,6 +23741,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18442,7 +23766,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX1250-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18473,6 +23803,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -18506,6 +23842,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -18539,6 +23881,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -18568,6 +23916,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -18598,6 +23952,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18617,6 +23977,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18634,6 +24000,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18651,6 +24023,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18668,6 +24046,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18687,6 +24071,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX11-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18706,6 +24096,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18725,6 +24121,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX12-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18744,7 +24146,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX1250-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18775,6 +24183,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -18808,6 +24222,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -18841,6 +24261,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -18870,6 +24296,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -18900,6 +24332,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18919,6 +24357,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18936,6 +24380,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18953,6 +24403,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18970,6 +24426,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18989,6 +24451,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19008,6 +24476,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19027,6 +24501,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19046,7 +24526,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19077,6 +24563,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19110,6 +24602,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -19143,6 +24641,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -19172,6 +24676,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -19202,6 +24712,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19221,6 +24737,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19238,6 +24760,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19255,6 +24783,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19272,6 +24806,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19291,6 +24831,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19310,6 +24856,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19329,6 +24881,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19348,7 +24906,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19379,6 +24943,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19412,6 +24982,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -19445,6 +25021,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -19474,6 +25056,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -19504,6 +25092,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19523,6 +25117,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19540,6 +25140,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19557,6 +25163,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19574,6 +25186,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19593,6 +25211,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19612,6 +25236,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19631,6 +25261,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19650,7 +25286,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19681,6 +25323,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19714,6 +25362,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -19747,6 +25401,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -19776,6 +25436,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -19806,6 +25472,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19825,6 +25497,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19842,6 +25520,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19859,6 +25543,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19876,6 +25566,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19895,6 +25591,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19914,6 +25616,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19933,6 +25641,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19952,7 +25666,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19983,6 +25703,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20016,6 +25742,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -20049,6 +25781,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -20078,6 +25816,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -20108,6 +25852,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20127,6 +25877,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20144,6 +25900,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20161,6 +25923,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20178,6 +25946,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20197,6 +25971,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20216,6 +25996,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20235,6 +26021,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20254,7 +26046,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20285,6 +26083,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20318,6 +26122,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -20351,6 +26161,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -20380,6 +26196,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -20410,6 +26232,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20429,6 +26257,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20446,6 +26280,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20463,6 +26303,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20480,6 +26326,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20499,6 +26351,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20518,6 +26376,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20537,6 +26401,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20556,7 +26426,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20587,6 +26463,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20620,6 +26502,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -20653,6 +26541,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -20682,6 +26576,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -20712,6 +26612,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20731,6 +26637,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20748,6 +26660,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20765,6 +26683,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20782,6 +26706,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20801,6 +26731,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20820,6 +26756,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20839,6 +26781,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20858,7 +26806,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20889,6 +26843,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20922,6 +26882,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -20955,6 +26921,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -20984,6 +26956,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -21014,6 +26992,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21033,6 +27017,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21050,6 +27040,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21067,6 +27063,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21084,6 +27086,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21103,6 +27111,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21122,6 +27136,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21141,6 +27161,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21160,7 +27186,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21191,6 +27223,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21224,6 +27262,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -21257,6 +27301,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -21286,6 +27336,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -21316,6 +27372,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21335,6 +27397,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21352,6 +27420,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21369,6 +27443,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21386,6 +27466,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21405,6 +27491,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21424,6 +27516,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21443,6 +27541,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21462,7 +27566,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21493,6 +27603,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21526,6 +27642,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -21559,6 +27681,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -21588,6 +27716,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -21618,6 +27752,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21637,6 +27777,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21654,6 +27800,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21671,6 +27823,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21688,6 +27846,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21707,6 +27871,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21726,6 +27896,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21745,6 +27921,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21764,7 +27946,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21795,6 +27983,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21828,6 +28022,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -21861,6 +28061,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -21890,6 +28096,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -21920,6 +28132,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21939,6 +28157,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21956,6 +28180,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21973,6 +28203,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21990,6 +28226,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22009,6 +28251,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22028,6 +28276,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22047,6 +28301,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22066,7 +28326,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -22097,6 +28363,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -22130,6 +28402,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -22163,6 +28441,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -22192,6 +28476,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -22222,6 +28512,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22241,6 +28537,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22258,6 +28560,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22275,6 +28583,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22292,6 +28606,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22311,6 +28631,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22330,6 +28656,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22349,6 +28681,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22368,7 +28706,13 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index 996dbc4d30d2..124d0f225db6 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -19,15 +19,19 @@ define amdgpu_kernel void @flat_system_unordered_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -37,15 +41,19 @@ define amdgpu_kernel void @flat_system_unordered_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -55,29 +63,37 @@ define amdgpu_kernel void @flat_system_unordered_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_system_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -85,13 +101,17 @@ define amdgpu_kernel void @flat_system_unordered_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -99,11 +119,15 @@ define amdgpu_kernel void @flat_system_unordered_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -111,23 +135,31 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_system_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_system_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -135,40 +167,52 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX11-WGP-LABEL: flat_system_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_system_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -177,12 +221,16 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX12-CU-LABEL: flat_system_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -192,12 +240,16 @@ define amdgpu_kernel void @flat_system_unordered_load( ; GFX1250-LABEL: flat_system_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -213,15 +265,19 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -231,15 +287,19 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -249,29 +309,37 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -279,13 +347,17 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -293,11 +365,15 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -305,23 +381,31 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_system_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_system_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -329,40 +413,52 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX11-WGP-LABEL: flat_system_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_system_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -371,12 +467,16 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX12-CU-LABEL: flat_system_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -386,12 +486,16 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; GFX1250-LABEL: flat_system_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -407,9 +511,12 @@ define amdgpu_kernel void @flat_system_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc @@ -426,9 +533,12 @@ define amdgpu_kernel void @flat_system_acquire_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc @@ -446,9 +556,12 @@ define amdgpu_kernel void @flat_system_acquire_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc @@ -462,9 +575,12 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; SKIP-CACHE-INV-LABEL: flat_system_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc @@ -478,9 +594,12 @@ define amdgpu_kernel void @flat_system_acquire_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -494,23 +613,30 @@ define amdgpu_kernel void @flat_system_acquire_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_system_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -521,22 +647,29 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX942-TGSPLIT-LABEL: flat_system_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_system_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc @@ -550,9 +683,12 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX11-CU-LABEL: flat_system_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc @@ -566,14 +702,18 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX12-WGP-LABEL: flat_system_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -581,14 +721,18 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX12-CU-LABEL: flat_system_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -597,14 +741,18 @@ define amdgpu_kernel void @flat_system_acquire_load( ; GFX1250-LABEL: flat_system_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -620,9 +768,12 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -640,9 +791,12 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -662,9 +816,12 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -680,9 +837,12 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -697,9 +857,12 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -714,9 +877,12 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -729,9 +895,12 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_system_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 @@ -743,9 +912,12 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX942-TGSPLIT-LABEL: flat_system_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 @@ -757,9 +929,12 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX11-WGP-LABEL: flat_system_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -775,9 +950,12 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX11-CU-LABEL: flat_system_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -793,9 +971,12 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX12-WGP-LABEL: flat_system_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -805,6 +986,7 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -812,9 +994,12 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX12-CU-LABEL: flat_system_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -824,6 +1009,7 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -832,16 +1018,20 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX1250-LABEL: flat_system_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -858,6 +1048,10 @@ define amdgpu_kernel void @flat_system_unordered_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -873,6 +1067,10 @@ define amdgpu_kernel void @flat_system_unordered_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -888,6 +1086,10 @@ define amdgpu_kernel void @flat_system_unordered_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -899,6 +1101,10 @@ define amdgpu_kernel void @flat_system_unordered_store( ; SKIP-CACHE-INV-LABEL: flat_system_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -912,6 +1118,10 @@ define amdgpu_kernel void @flat_system_unordered_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -924,6 +1134,10 @@ define amdgpu_kernel void @flat_system_unordered_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -934,6 +1148,10 @@ define amdgpu_kernel void @flat_system_unordered_store( ; GFX942-NOTTGSPLIT-LABEL: flat_system_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -944,6 +1162,10 @@ define amdgpu_kernel void @flat_system_unordered_store( ; GFX942-TGSPLIT-LABEL: flat_system_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -954,6 +1176,10 @@ define amdgpu_kernel void @flat_system_unordered_store( ; GFX11-WGP-LABEL: flat_system_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -965,6 +1191,10 @@ define amdgpu_kernel void @flat_system_unordered_store( ; GFX11-CU-LABEL: flat_system_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -976,6 +1206,10 @@ define amdgpu_kernel void @flat_system_unordered_store( ; GFX12-WGP-LABEL: flat_system_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -987,6 +1221,10 @@ define amdgpu_kernel void @flat_system_unordered_store( ; GFX12-CU-LABEL: flat_system_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -998,12 +1236,16 @@ define amdgpu_kernel void @flat_system_unordered_store( ; GFX1250-LABEL: flat_system_unordered_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1019,6 +1261,10 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1034,6 +1280,10 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -1049,6 +1299,10 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -1060,6 +1314,10 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -1073,6 +1331,10 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1085,6 +1347,10 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1095,6 +1361,10 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX942-NOTTGSPLIT-LABEL: flat_system_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1105,6 +1375,10 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX942-TGSPLIT-LABEL: flat_system_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1115,6 +1389,10 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX11-WGP-LABEL: flat_system_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1126,6 +1404,10 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX11-CU-LABEL: flat_system_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1137,6 +1419,10 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX12-WGP-LABEL: flat_system_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1148,6 +1434,10 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX12-CU-LABEL: flat_system_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1159,12 +1449,16 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX1250-LABEL: flat_system_monotonic_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1180,6 +1474,10 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1196,6 +1494,10 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -1213,6 +1515,10 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -1226,6 +1532,10 @@ define amdgpu_kernel void @flat_system_release_store( ; SKIP-CACHE-INV-LABEL: flat_system_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -1240,6 +1550,10 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1254,6 +1568,10 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1266,6 +1584,10 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX942-NOTTGSPLIT-LABEL: flat_system_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1278,6 +1600,10 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX942-TGSPLIT-LABEL: flat_system_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1290,6 +1616,10 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX11-WGP-LABEL: flat_system_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1303,6 +1633,10 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX11-CU-LABEL: flat_system_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1316,6 +1650,10 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX12-WGP-LABEL: flat_system_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1332,6 +1670,10 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX12-CU-LABEL: flat_system_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1348,10 +1690,13 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX1250-LABEL: flat_system_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1359,6 +1704,7 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1374,6 +1720,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1390,6 +1740,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -1407,6 +1761,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -1420,6 +1778,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -1434,6 +1796,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1448,6 +1814,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1460,6 +1830,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX942-NOTTGSPLIT-LABEL: flat_system_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1472,6 +1846,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX942-TGSPLIT-LABEL: flat_system_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1484,6 +1862,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX11-WGP-LABEL: flat_system_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1497,6 +1879,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX11-CU-LABEL: flat_system_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1510,6 +1896,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX12-WGP-LABEL: flat_system_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1526,6 +1916,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX12-CU-LABEL: flat_system_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1542,10 +1936,13 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX1250-LABEL: flat_system_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1553,6 +1950,7 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1567,11 +1965,15 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -1582,11 +1984,15 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -1597,22 +2003,30 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1621,10 +2035,14 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1633,74 +2051,102 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_system_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_system_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -1708,7 +2154,11 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX1250-LABEL: flat_system_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1728,11 +2178,15 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1745,11 +2199,15 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1764,11 +2222,15 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1779,11 +2241,15 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_system_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1793,10 +2259,14 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1808,10 +2278,14 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -1821,10 +2295,14 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1833,10 +2311,14 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_system_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -1845,11 +2327,15 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: flat_system_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1860,11 +2346,15 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX11-CU-LABEL: flat_system_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1875,11 +2365,15 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: flat_system_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 @@ -1888,11 +2382,15 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX12-CU-LABEL: flat_system_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 @@ -1902,7 +2400,11 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX1250-LABEL: flat_system_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1925,11 +2427,15 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -1941,11 +2447,15 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1958,11 +2468,15 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1971,11 +2485,15 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_system_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 @@ -1985,10 +2503,14 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1999,10 +2521,14 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2011,10 +2537,14 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2023,10 +2553,14 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_system_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2035,11 +2569,15 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX11-WGP-LABEL: flat_system_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2048,11 +2586,15 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX11-CU-LABEL: flat_system_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2061,11 +2603,15 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX12-WGP-LABEL: flat_system_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -2077,11 +2623,15 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX12-CU-LABEL: flat_system_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -2094,7 +2644,11 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX1250-LABEL: flat_system_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2119,11 +2673,15 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -2137,11 +2695,15 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2158,11 +2720,15 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2175,11 +2741,15 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 @@ -2190,10 +2760,14 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2207,10 +2781,14 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2222,10 +2800,14 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2236,10 +2818,14 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2250,11 +2836,15 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: flat_system_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2267,11 +2857,15 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: flat_system_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2284,11 +2878,15 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: flat_system_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -2302,11 +2900,15 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: flat_system_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -2321,7 +2923,11 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX1250-LABEL: flat_system_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2349,11 +2955,15 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -2367,11 +2977,15 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2388,11 +3002,15 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2405,11 +3023,15 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 @@ -2420,10 +3042,14 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2437,10 +3063,14 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2452,10 +3082,14 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2466,10 +3100,14 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2480,11 +3118,15 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: flat_system_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2497,11 +3139,15 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: flat_system_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2514,11 +3160,15 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: flat_system_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -2532,11 +3182,15 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: flat_system_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -2551,7 +3205,11 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX1250-LABEL: flat_system_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2580,6 +3238,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2600,6 +3262,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2621,6 +3287,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2638,6 +3308,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_system_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2655,6 +3329,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2672,6 +3350,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2687,6 +3369,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2701,6 +3387,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2715,6 +3405,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2732,6 +3426,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2749,6 +3447,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2765,6 +3467,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2781,7 +3487,11 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX1250-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2807,6 +3517,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2828,6 +3542,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2851,6 +3569,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2870,6 +3592,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2888,6 +3614,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2907,6 +3637,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2924,6 +3658,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2940,6 +3678,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2956,6 +3698,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2975,6 +3721,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2994,6 +3744,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3015,6 +3769,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3036,7 +3794,11 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX1250-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -3067,6 +3829,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -3088,6 +3854,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -3111,6 +3881,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -3130,6 +3904,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -3148,6 +3926,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -3167,6 +3949,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -3184,6 +3970,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -3200,6 +3990,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -3216,6 +4010,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3235,6 +4033,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3254,6 +4056,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3275,6 +4081,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3296,7 +4106,11 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX1250-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -3327,11 +4141,16 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3341,6 +4160,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3356,11 +4176,16 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -3370,6 +4195,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -3385,11 +4211,16 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -3399,6 +4230,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -3410,11 +4242,16 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -3424,6 +4261,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -3437,6 +4275,12 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3453,6 +4297,12 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3467,6 +4317,12 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3481,6 +4337,12 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3495,6 +4357,12 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3510,6 +4378,12 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3525,6 +4399,12 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3540,6 +4420,12 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3555,7 +4441,13 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3581,11 +4473,16 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3595,6 +4492,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3612,11 +4510,16 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -3626,6 +4529,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -3645,11 +4549,16 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -3659,6 +4568,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -3674,11 +4584,16 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -3688,6 +4603,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -3702,6 +4618,12 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3721,6 +4643,12 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3738,6 +4666,12 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3754,6 +4688,12 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3770,6 +4710,12 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3789,6 +4735,12 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3808,6 +4760,12 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3825,6 +4783,12 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3842,7 +4806,13 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3871,11 +4841,16 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3885,6 +4860,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3901,11 +4877,16 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -3915,6 +4896,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -3932,11 +4914,16 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -3946,6 +4933,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -3959,11 +4947,16 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -3973,6 +4966,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -3987,6 +4981,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4005,6 +5005,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4021,6 +5027,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4037,6 +5049,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4053,6 +5071,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4070,6 +5094,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4087,6 +5117,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4107,6 +5143,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4127,7 +5169,13 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX1250-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4158,11 +5206,16 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4172,6 +5225,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4190,11 +5244,16 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -4204,6 +5263,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -4225,11 +5285,16 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -4239,6 +5304,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -4256,11 +5322,16 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -4270,6 +5341,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -4285,6 +5357,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4306,6 +5384,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4325,6 +5409,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4343,6 +5433,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4361,6 +5457,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4382,6 +5484,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4403,6 +5511,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4425,6 +5539,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4447,7 +5567,13 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4481,11 +5607,16 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4495,6 +5626,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4513,11 +5645,16 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -4527,6 +5664,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -4548,11 +5686,16 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -4562,6 +5705,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -4579,11 +5723,16 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -4593,6 +5742,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -4608,6 +5758,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4629,6 +5785,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4648,6 +5810,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4666,6 +5834,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4684,6 +5858,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4705,6 +5885,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4726,6 +5912,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4748,6 +5940,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4770,7 +5968,13 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4804,11 +6008,16 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4818,6 +6027,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4835,11 +6045,16 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -4849,6 +6064,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -4868,11 +6084,16 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -4882,6 +6103,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -4897,11 +6119,16 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -4911,6 +6138,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -4925,6 +6153,12 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4944,6 +6178,12 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4961,6 +6201,12 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4977,6 +6223,12 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4993,6 +6245,12 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5012,6 +6270,12 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5031,6 +6295,12 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5048,6 +6318,12 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5065,7 +6341,13 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5094,11 +6376,16 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5108,6 +6395,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5125,11 +6413,16 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -5139,6 +6432,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -5158,11 +6452,16 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -5172,6 +6471,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -5187,11 +6487,16 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -5201,6 +6506,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -5215,6 +6521,12 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5234,6 +6546,12 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5251,6 +6569,12 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5267,6 +6591,12 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5283,6 +6613,12 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5302,6 +6638,12 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5321,6 +6663,12 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5338,6 +6686,12 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5355,7 +6709,13 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX1250-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5384,11 +6744,16 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5398,6 +6763,7 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5416,11 +6782,16 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -5430,6 +6801,7 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -5451,11 +6823,16 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -5465,6 +6842,7 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -5482,11 +6860,16 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -5496,6 +6879,7 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -5511,6 +6895,12 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5532,6 +6922,12 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5551,6 +6947,12 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5569,6 +6971,12 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5587,6 +6995,12 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_system_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5608,6 +7022,12 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_system_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5629,6 +7049,12 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5651,6 +7077,12 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5673,7 +7105,13 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX1250-LABEL: flat_system_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5707,11 +7145,16 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5721,6 +7164,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5739,11 +7183,16 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -5753,6 +7202,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -5774,11 +7224,16 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -5788,6 +7243,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -5805,11 +7261,16 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -5819,6 +7280,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -5834,6 +7296,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5855,6 +7323,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5874,6 +7348,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5892,6 +7372,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5910,6 +7396,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5931,6 +7423,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5952,6 +7450,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5974,6 +7478,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5996,7 +7506,13 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6030,11 +7546,16 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6044,6 +7565,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6062,11 +7584,16 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -6076,6 +7603,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -6097,11 +7625,16 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -6111,6 +7644,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -6128,11 +7662,16 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -6142,6 +7681,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -6157,6 +7697,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6178,6 +7724,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6197,6 +7749,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6215,6 +7773,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6233,6 +7797,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6254,6 +7824,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6275,6 +7851,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6297,6 +7879,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6319,7 +7907,13 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6353,11 +7947,16 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6367,6 +7966,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6385,11 +7985,16 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -6399,6 +8004,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -6420,11 +8026,16 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -6434,6 +8045,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -6451,11 +8063,16 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -6465,6 +8082,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -6480,6 +8098,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6501,6 +8125,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6520,6 +8150,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6538,6 +8174,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6556,6 +8198,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6577,6 +8225,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6598,6 +8252,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6620,6 +8280,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6642,7 +8308,13 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6676,11 +8348,16 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6690,6 +8367,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6708,11 +8386,16 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -6722,6 +8405,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -6743,11 +8427,16 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -6757,6 +8446,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -6774,11 +8464,16 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -6788,6 +8483,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -6803,6 +8499,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6824,6 +8526,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6843,6 +8551,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6861,6 +8575,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6879,6 +8599,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6900,6 +8626,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6921,6 +8653,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6943,6 +8681,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6965,7 +8709,13 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6999,11 +8749,16 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -7013,6 +8768,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -7031,11 +8787,16 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -7045,6 +8806,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -7066,11 +8828,16 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -7080,6 +8847,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -7097,11 +8865,16 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -7111,6 +8884,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -7126,6 +8900,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7147,6 +8927,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7166,6 +8952,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7184,6 +8976,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7202,6 +9000,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7223,6 +9027,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7244,6 +9054,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7266,6 +9082,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7288,7 +9110,13 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7322,11 +9150,16 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -7336,6 +9169,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -7354,11 +9188,16 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -7368,6 +9207,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -7389,11 +9229,16 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -7403,6 +9248,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -7420,11 +9266,16 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -7434,6 +9285,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -7449,6 +9301,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7470,6 +9328,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7489,6 +9353,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7507,6 +9377,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7525,6 +9401,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7546,6 +9428,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7567,6 +9455,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7589,6 +9483,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7611,7 +9511,13 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7645,11 +9551,16 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -7659,6 +9570,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -7677,11 +9589,16 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -7691,6 +9608,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -7712,11 +9630,16 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -7726,6 +9649,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -7743,11 +9667,16 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -7757,6 +9686,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -7772,6 +9702,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7793,6 +9729,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7812,6 +9754,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7830,6 +9778,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7848,6 +9802,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7869,6 +9829,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7890,6 +9856,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7912,6 +9884,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7934,7 +9912,13 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7969,6 +9953,12 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8002,6 +9992,12 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -8035,6 +10031,12 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -8064,6 +10066,12 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -8094,6 +10102,12 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8113,6 +10127,12 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8130,6 +10150,12 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8147,6 +10173,12 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8164,6 +10196,12 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8183,6 +10221,12 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8202,6 +10246,12 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8221,6 +10271,12 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8240,7 +10296,13 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8271,6 +10333,12 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8305,6 +10373,12 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -8340,6 +10414,12 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -8371,6 +10451,12 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -8401,6 +10487,12 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8422,6 +10514,12 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8441,6 +10539,12 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8459,6 +10563,12 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8477,6 +10587,12 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8498,6 +10614,12 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8519,6 +10641,12 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8539,6 +10667,12 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8559,7 +10693,13 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8592,6 +10732,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8626,6 +10772,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -8661,6 +10813,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -8692,6 +10850,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -8723,6 +10887,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8744,6 +10914,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8763,6 +10939,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8782,6 +10964,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8801,6 +10989,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8822,6 +11016,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8843,6 +11043,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8867,6 +11073,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8891,7 +11103,13 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8927,6 +11145,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8962,6 +11186,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -8999,6 +11229,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -9032,6 +11268,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -9063,6 +11305,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9086,6 +11334,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9107,6 +11361,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9127,6 +11387,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9147,6 +11413,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9170,6 +11442,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9193,6 +11471,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9218,6 +11502,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9243,7 +11533,13 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9281,6 +11577,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9316,6 +11618,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -9353,6 +11661,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -9386,6 +11700,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -9417,6 +11737,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9440,6 +11766,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9461,6 +11793,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9481,6 +11819,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9501,6 +11845,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9524,6 +11874,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9547,6 +11903,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9572,6 +11934,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9597,7 +11965,13 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9635,6 +12009,12 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9669,6 +12049,12 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -9704,6 +12090,12 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -9735,6 +12127,12 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -9765,6 +12163,12 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9786,6 +12190,12 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9805,6 +12215,12 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9823,6 +12239,12 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9841,6 +12263,12 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9862,6 +12290,12 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9883,6 +12317,12 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9903,6 +12343,12 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9923,7 +12369,13 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9956,6 +12408,12 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9990,6 +12448,12 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -10025,6 +12489,12 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -10056,6 +12526,12 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -10086,6 +12562,12 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10107,6 +12589,12 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10126,6 +12614,12 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10144,6 +12638,12 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10162,6 +12662,12 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10183,6 +12689,12 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10204,6 +12716,12 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10224,6 +12742,12 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10244,7 +12768,13 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10277,6 +12807,12 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10312,6 +12848,12 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -10349,6 +12891,12 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -10382,6 +12930,12 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -10413,6 +12967,12 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10436,6 +12996,12 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10457,6 +13023,12 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10477,6 +13049,12 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10497,6 +13075,12 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10520,6 +13104,12 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10543,6 +13133,12 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10568,6 +13164,12 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10593,7 +13195,13 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10631,6 +13239,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10666,6 +13280,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -10703,6 +13323,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -10736,6 +13362,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -10767,6 +13399,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10790,6 +13428,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10811,6 +13455,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10831,6 +13481,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10851,6 +13507,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10874,6 +13536,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10897,6 +13565,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10922,6 +13596,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10947,7 +13627,13 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10985,6 +13671,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -11020,6 +13712,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -11057,6 +13755,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -11090,6 +13794,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -11121,6 +13831,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11144,6 +13860,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11165,6 +13887,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11185,6 +13913,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11205,6 +13939,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11228,6 +13968,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11251,6 +13997,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11276,6 +14028,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11301,7 +14059,13 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -11339,6 +14103,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -11374,6 +14144,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -11411,6 +14187,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -11444,6 +14226,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -11475,6 +14263,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11498,6 +14292,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11519,6 +14319,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11539,6 +14345,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11559,6 +14371,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11582,6 +14400,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11605,6 +14429,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11630,6 +14460,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11655,7 +14491,13 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -11693,6 +14535,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -11728,6 +14576,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -11765,6 +14619,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -11798,6 +14658,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -11829,6 +14695,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11852,6 +14724,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11873,6 +14751,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11893,6 +14777,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11913,6 +14803,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11936,6 +14832,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11959,6 +14861,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11984,6 +14892,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12009,7 +14923,13 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -12047,6 +14967,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -12082,6 +15008,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -12119,6 +15051,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -12152,6 +15090,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -12183,6 +15127,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12206,6 +15156,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12227,6 +15183,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12247,6 +15209,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12267,6 +15235,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12290,6 +15264,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12313,6 +15293,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12338,6 +15324,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12363,7 +15355,13 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -12401,6 +15399,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -12436,6 +15440,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -12473,6 +15483,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -12506,6 +15522,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -12537,6 +15559,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12560,6 +15588,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12581,6 +15615,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12601,6 +15641,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12621,6 +15667,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12644,6 +15696,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12667,6 +15725,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12692,6 +15756,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12717,7 +15787,13 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -12755,6 +15831,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -12790,6 +15872,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -12827,6 +15915,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -12860,6 +15954,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -12891,6 +15991,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12914,6 +16020,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12935,6 +16047,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12955,6 +16073,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12975,6 +16099,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12998,6 +16128,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13021,6 +16157,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13046,6 +16188,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13071,7 +16219,13 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -13107,15 +16261,19 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -13125,15 +16283,19 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -13143,29 +16305,37 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13173,13 +16343,17 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13187,11 +16361,15 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -13199,23 +16377,31 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_system_one_as_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -13223,40 +16409,52 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX11-WGP-LABEL: flat_system_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_system_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -13265,12 +16463,16 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX12-CU-LABEL: flat_system_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -13280,12 +16482,16 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; GFX1250-LABEL: flat_system_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -13301,15 +16507,19 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -13319,15 +16529,19 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -13337,29 +16551,37 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13367,13 +16589,17 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13381,11 +16607,15 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -13393,23 +16623,31 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_system_one_as_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -13417,40 +16655,52 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -13459,12 +16709,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX12-CU-LABEL: flat_system_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -13474,12 +16728,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; GFX1250-LABEL: flat_system_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -13495,17 +16753,20 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -13515,18 +16776,21 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -13536,33 +16800,38 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13570,16 +16839,19 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13587,89 +16859,110 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_system_one_as_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_system_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_system_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -13678,14 +16971,18 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX12-CU-LABEL: flat_system_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -13695,14 +16992,18 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; GFX1250-LABEL: flat_system_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -13718,18 +17019,21 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -13739,9 +17043,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -13750,9 +17057,9 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -13762,9 +17069,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -13773,25 +17083,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13799,17 +17111,20 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13817,53 +17132,67 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -13872,17 +17201,20 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -13891,17 +17223,20 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -13911,6 +17246,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -13919,9 +17255,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -13931,6 +17270,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -13940,16 +17280,20 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX1250-LABEL: flat_system_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -13966,6 +17310,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -13981,6 +17329,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -13996,6 +17348,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -14007,6 +17363,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; SKIP-CACHE-INV-LABEL: flat_system_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -14020,6 +17380,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -14032,6 +17396,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -14042,6 +17410,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -14052,6 +17424,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -14062,6 +17438,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX11-WGP-LABEL: flat_system_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -14073,6 +17453,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX11-CU-LABEL: flat_system_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -14084,6 +17468,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX12-WGP-LABEL: flat_system_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -14095,6 +17483,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX12-CU-LABEL: flat_system_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -14106,12 +17498,16 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX1250-LABEL: flat_system_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -14127,6 +17523,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -14142,6 +17542,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -14157,6 +17561,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -14168,6 +17576,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -14181,6 +17593,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -14193,6 +17609,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -14203,6 +17623,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -14213,6 +17637,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -14223,6 +17651,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -14234,6 +17666,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX11-CU-LABEL: flat_system_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -14245,6 +17681,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -14256,6 +17696,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX12-CU-LABEL: flat_system_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -14267,12 +17711,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX1250-LABEL: flat_system_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -14288,6 +17736,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -14304,6 +17756,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -14321,6 +17777,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -14334,6 +17794,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -14348,6 +17812,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -14362,6 +17830,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -14374,6 +17846,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -14386,6 +17862,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -14398,6 +17878,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX11-WGP-LABEL: flat_system_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -14411,6 +17895,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX11-CU-LABEL: flat_system_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -14424,6 +17912,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX12-WGP-LABEL: flat_system_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -14440,6 +17932,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX12-CU-LABEL: flat_system_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -14456,10 +17952,13 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX1250-LABEL: flat_system_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -14467,6 +17966,7 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -14482,6 +17982,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -14498,6 +18002,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -14515,6 +18023,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -14528,6 +18040,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -14542,6 +18058,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -14556,6 +18076,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -14568,6 +18092,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -14580,6 +18108,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -14592,6 +18124,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -14605,6 +18141,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -14618,6 +18158,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -14634,6 +18178,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -14650,10 +18198,13 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX1250-LABEL: flat_system_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -14661,6 +18212,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -14675,11 +18227,15 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -14690,11 +18246,15 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -14705,22 +18265,30 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -14729,10 +18297,14 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -14741,74 +18313,102 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -14816,7 +18416,11 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX1250-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14836,11 +18440,15 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -14853,11 +18461,15 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -14871,11 +18483,15 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -14885,11 +18501,15 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -14899,10 +18519,14 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -14914,10 +18538,14 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -14927,10 +18555,14 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -14939,10 +18571,14 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -14951,11 +18587,15 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -14965,11 +18605,15 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX11-CU-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -14979,11 +18623,15 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -14992,11 +18640,15 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX12-CU-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 @@ -15006,7 +18658,11 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX1250-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15029,11 +18685,15 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -15045,11 +18705,15 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15062,11 +18726,15 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15075,11 +18743,15 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 @@ -15089,10 +18761,14 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -15103,10 +18779,14 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -15115,10 +18795,14 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -15127,10 +18811,14 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -15139,11 +18827,15 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX11-WGP-LABEL: flat_system_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15152,11 +18844,15 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX11-CU-LABEL: flat_system_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15165,11 +18861,15 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX12-WGP-LABEL: flat_system_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -15181,11 +18881,15 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX12-CU-LABEL: flat_system_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -15198,7 +18902,11 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX1250-LABEL: flat_system_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15223,11 +18931,15 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -15241,11 +18953,15 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15261,11 +18977,15 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15277,11 +18997,15 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 @@ -15292,10 +19016,14 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -15309,10 +19037,14 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -15324,10 +19056,14 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -15338,10 +19074,14 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -15352,11 +19092,15 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15368,11 +19112,15 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15384,11 +19132,15 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -15402,11 +19154,15 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -15421,7 +19177,11 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX1250-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15449,11 +19209,15 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -15467,11 +19231,15 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15487,11 +19255,15 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15503,11 +19275,15 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 @@ -15518,10 +19294,14 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -15535,10 +19315,14 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -15550,10 +19334,14 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -15564,10 +19352,14 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -15578,11 +19370,15 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15594,11 +19390,15 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -15610,11 +19410,15 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -15628,11 +19432,15 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -15647,7 +19455,11 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX1250-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15676,6 +19488,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15697,6 +19513,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -15719,6 +19539,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -15737,6 +19561,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15755,6 +19583,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -15773,6 +19605,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -15788,6 +19624,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -15803,6 +19643,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -15817,6 +19661,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -15835,6 +19683,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -15853,6 +19705,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -15870,6 +19726,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -15887,7 +19747,11 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX1250-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15913,6 +19777,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15935,6 +19803,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -15959,6 +19831,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -15979,6 +19855,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15998,6 +19878,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -16018,6 +19902,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -16035,6 +19923,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -16052,6 +19944,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -16068,6 +19964,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -16088,6 +19988,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -16108,6 +20012,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -16130,6 +20038,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -16152,7 +20064,11 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX1250-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16183,6 +20099,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16205,6 +20125,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -16229,6 +20153,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -16249,6 +20177,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16268,6 +20200,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -16288,6 +20224,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -16305,6 +20245,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -16322,6 +20266,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -16338,6 +20286,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -16358,6 +20310,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -16378,6 +20334,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -16400,6 +20360,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -16422,7 +20386,11 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX1250-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16453,11 +20421,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16467,6 +20440,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16482,11 +20456,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -16496,6 +20475,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -16511,11 +20491,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -16525,6 +20510,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -16536,11 +20522,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -16550,6 +20541,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -16563,6 +20555,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16579,6 +20577,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16593,6 +20597,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16607,6 +20617,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16621,6 +20637,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16636,6 +20658,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16651,6 +20679,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16666,6 +20700,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16681,7 +20721,13 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16707,11 +20753,16 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16721,6 +20772,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16738,11 +20790,16 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -16752,6 +20809,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -16770,11 +20828,16 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -16784,6 +20847,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -16798,11 +20862,16 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -16812,6 +20881,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -16826,6 +20896,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16845,6 +20921,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16862,6 +20944,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16878,6 +20966,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16894,6 +20988,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16912,6 +21012,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16930,6 +21036,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16947,6 +21059,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16964,7 +21082,13 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16993,11 +21117,16 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17007,6 +21136,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17023,11 +21153,16 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -17037,6 +21172,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -17054,11 +21190,16 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -17068,6 +21209,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -17081,11 +21223,16 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -17095,6 +21242,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -17109,6 +21257,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17127,6 +21281,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17143,6 +21303,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17159,6 +21325,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17175,6 +21347,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17192,6 +21370,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17209,6 +21393,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17229,6 +21419,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17249,7 +21445,13 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17280,11 +21482,16 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17294,6 +21501,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17312,11 +21520,16 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -17326,6 +21539,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -17346,11 +21560,16 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -17360,6 +21579,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -17376,11 +21596,16 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -17390,6 +21615,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -17405,6 +21631,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17426,6 +21658,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17445,6 +21683,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17463,6 +21707,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17481,6 +21731,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17501,6 +21757,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17521,6 +21783,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17543,6 +21811,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17565,7 +21839,13 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17599,11 +21879,16 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17613,6 +21898,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17631,11 +21917,16 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -17645,6 +21936,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -17665,11 +21957,16 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -17679,6 +21976,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -17695,11 +21993,16 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -17709,6 +22012,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -17724,6 +22028,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17745,6 +22055,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17764,6 +22080,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17782,6 +22104,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17800,6 +22128,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17820,6 +22154,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17840,6 +22180,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17862,6 +22208,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17884,7 +22236,13 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17918,11 +22276,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17932,6 +22295,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17949,11 +22313,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -17963,6 +22332,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -17981,11 +22351,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -17995,6 +22370,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -18009,11 +22385,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -18023,6 +22404,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -18037,6 +22419,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18056,6 +22444,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18073,6 +22467,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18089,6 +22489,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18105,6 +22511,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18123,6 +22535,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18141,6 +22559,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18158,6 +22582,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18175,7 +22605,13 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18204,11 +22640,16 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -18218,6 +22659,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -18235,11 +22677,16 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -18249,6 +22696,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -18267,11 +22715,16 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -18281,6 +22734,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -18295,11 +22749,16 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -18309,6 +22768,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -18323,6 +22783,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18342,6 +22808,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18359,6 +22831,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18375,6 +22853,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18391,6 +22875,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18409,6 +22899,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18427,6 +22923,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18444,6 +22946,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18461,7 +22969,13 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18490,11 +23004,16 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -18504,6 +23023,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -18522,11 +23042,16 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -18536,6 +23061,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -18556,11 +23082,16 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -18570,6 +23101,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -18586,11 +23118,16 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -18600,6 +23137,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -18615,6 +23153,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18636,6 +23180,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18655,6 +23205,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18673,6 +23229,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18691,6 +23253,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18711,6 +23279,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18731,6 +23305,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18753,6 +23333,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18775,7 +23361,13 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18809,11 +23401,16 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -18823,6 +23420,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -18841,11 +23439,16 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -18855,6 +23458,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -18875,11 +23479,16 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -18889,6 +23498,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -18905,11 +23515,16 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -18919,6 +23534,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -18934,6 +23550,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18955,6 +23577,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18974,6 +23602,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18992,6 +23626,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19010,6 +23650,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19030,6 +23676,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19050,6 +23702,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19072,6 +23730,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19094,7 +23758,13 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19128,11 +23798,16 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -19142,6 +23817,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -19160,11 +23836,16 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -19174,6 +23855,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -19194,11 +23876,16 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -19208,6 +23895,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -19224,11 +23912,16 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -19238,6 +23931,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -19253,6 +23947,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19274,6 +23974,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19293,6 +23999,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19311,6 +24023,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19329,6 +24047,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19349,6 +24073,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19369,6 +24099,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19391,6 +24127,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19413,7 +24155,13 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19447,11 +24195,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -19461,6 +24214,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -19479,11 +24233,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -19493,6 +24252,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -19513,11 +24273,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -19527,6 +24292,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -19543,11 +24309,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -19557,6 +24328,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -19572,6 +24344,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19593,6 +24371,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19612,6 +24396,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19630,6 +24420,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19648,6 +24444,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19668,6 +24470,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19688,6 +24496,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19710,6 +24524,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19732,7 +24552,13 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19766,11 +24592,16 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -19780,6 +24611,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -19798,11 +24630,16 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -19812,6 +24649,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -19832,11 +24670,16 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -19846,6 +24689,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -19862,11 +24706,16 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -19876,6 +24725,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -19891,6 +24741,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19912,6 +24768,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19931,6 +24793,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19949,6 +24817,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19967,6 +24841,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19987,6 +24867,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20007,6 +24893,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20029,6 +24921,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20051,7 +24949,13 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20085,11 +24989,16 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -20099,6 +25008,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -20117,11 +25027,16 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -20131,6 +25046,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -20151,11 +25067,16 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -20165,6 +25086,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -20181,11 +25103,16 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -20195,6 +25122,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -20210,6 +25138,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20231,6 +25165,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20250,6 +25190,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20268,6 +25214,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20286,6 +25238,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20306,6 +25264,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20326,6 +25290,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20348,6 +25318,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20370,7 +25346,13 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20404,11 +25386,16 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -20418,6 +25405,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -20436,11 +25424,16 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -20450,6 +25443,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -20470,11 +25464,16 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -20484,6 +25483,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -20500,11 +25500,16 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -20514,6 +25519,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -20529,6 +25535,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20550,6 +25562,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20569,6 +25587,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20587,6 +25611,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20605,6 +25635,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20625,6 +25661,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20645,6 +25687,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20667,6 +25715,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20689,7 +25743,13 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20723,11 +25783,16 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -20737,6 +25802,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -20755,11 +25821,16 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -20769,6 +25840,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -20789,11 +25861,16 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -20803,6 +25880,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -20819,11 +25897,16 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -20833,6 +25916,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -20848,6 +25932,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20869,6 +25959,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20888,6 +25984,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20906,6 +26008,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20924,6 +26032,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20944,6 +26058,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20964,6 +26084,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20986,6 +26112,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21008,7 +26140,13 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21043,6 +26181,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21076,6 +26220,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -21109,6 +26259,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -21138,6 +26294,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -21168,6 +26330,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21187,6 +26355,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21204,6 +26378,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21221,6 +26401,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21238,6 +26424,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21257,6 +26449,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21276,6 +26474,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21295,6 +26499,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21314,7 +26524,13 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21345,6 +26561,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21380,6 +26602,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -21416,6 +26644,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -21448,6 +26682,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -21479,6 +26719,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21501,6 +26747,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21520,6 +26772,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21539,6 +26797,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21557,6 +26821,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21579,6 +26849,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21601,6 +26877,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21622,6 +26904,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21643,7 +26931,13 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21676,6 +26970,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21710,6 +27010,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -21745,6 +27051,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -21776,6 +27088,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -21807,6 +27125,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21828,6 +27152,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21847,6 +27177,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21866,6 +27202,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21885,6 +27227,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21906,6 +27254,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21927,6 +27281,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21951,6 +27311,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21975,7 +27341,13 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -22011,6 +27383,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -22047,6 +27425,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -22085,6 +27469,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -22119,6 +27509,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -22151,6 +27547,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22175,6 +27577,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22196,6 +27604,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22217,6 +27631,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22237,6 +27657,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22261,6 +27687,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22285,6 +27717,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22311,6 +27749,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22337,7 +27781,13 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -22375,6 +27825,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -22411,6 +27867,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -22449,6 +27911,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -22483,6 +27951,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -22515,6 +27989,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22539,6 +28019,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22560,6 +28046,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22581,6 +28073,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22601,6 +28099,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22625,6 +28129,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22649,6 +28159,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22675,6 +28191,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22701,7 +28223,13 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -22739,6 +28267,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -22774,6 +28308,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -22810,6 +28350,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -22842,6 +28388,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -22873,6 +28425,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22895,6 +28453,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22914,6 +28478,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22933,6 +28503,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22951,6 +28527,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22973,6 +28555,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22995,6 +28583,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -23016,6 +28610,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -23037,7 +28637,13 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -23070,6 +28676,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -23105,6 +28717,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -23141,6 +28759,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -23173,6 +28797,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -23204,6 +28834,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23226,6 +28862,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23245,6 +28887,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23264,6 +28912,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23282,6 +28936,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -23304,6 +28964,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -23326,6 +28992,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -23347,6 +29019,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -23368,7 +29046,13 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -23401,6 +29085,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -23437,6 +29127,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -23475,6 +29171,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -23509,6 +29211,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -23541,6 +29249,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23565,6 +29279,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23586,6 +29306,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23607,6 +29333,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23627,6 +29359,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -23651,6 +29389,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -23675,6 +29419,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -23701,6 +29451,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -23727,7 +29483,13 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -23765,6 +29527,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -23801,6 +29569,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -23839,6 +29613,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -23873,6 +29653,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -23905,6 +29691,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23929,6 +29721,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23950,6 +29748,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23971,6 +29775,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23991,6 +29801,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -24015,6 +29831,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -24039,6 +29861,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -24065,6 +29893,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -24091,7 +29925,13 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -24129,6 +29969,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -24165,6 +30011,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -24203,6 +30055,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -24237,6 +30095,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -24269,6 +30133,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24293,6 +30163,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24314,6 +30190,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24335,6 +30217,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24355,6 +30243,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -24379,6 +30273,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -24403,6 +30303,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -24429,6 +30335,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -24455,7 +30367,13 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -24493,6 +30411,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -24529,6 +30453,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -24567,6 +30497,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -24601,6 +30537,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -24633,6 +30575,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24657,6 +30605,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24678,6 +30632,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24699,6 +30659,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -24719,6 +30685,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -24743,6 +30715,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -24767,6 +30745,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -24793,6 +30777,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -24819,7 +30809,13 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -24857,6 +30853,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -24893,6 +30895,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -24931,6 +30939,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -24965,6 +30979,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -24997,6 +31017,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25021,6 +31047,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25042,6 +31074,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25063,6 +31101,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25083,6 +31127,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -25107,6 +31157,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -25131,6 +31187,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -25157,6 +31219,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -25183,7 +31251,13 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -25221,6 +31295,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -25257,6 +31337,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -25295,6 +31381,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -25329,6 +31421,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -25361,6 +31459,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25385,6 +31489,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25406,6 +31516,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25427,6 +31543,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25447,6 +31569,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -25471,6 +31599,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -25495,6 +31629,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -25521,6 +31661,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -25547,7 +31693,13 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -25585,6 +31737,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -25621,6 +31779,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -25659,6 +31823,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -25693,6 +31863,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -25725,6 +31901,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25749,6 +31931,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25770,6 +31958,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25791,6 +31985,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -25811,6 +32011,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -25835,6 +32041,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -25859,6 +32071,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -25885,6 +32103,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -25911,7 +32135,13 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -25949,6 +32179,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -25985,6 +32221,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -26023,6 +32265,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -26057,6 +32305,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -26089,6 +32343,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -26113,6 +32373,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -26134,6 +32400,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -26155,6 +32427,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -26175,6 +32453,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -26199,6 +32483,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -26223,6 +32513,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -26249,6 +32545,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -26275,7 +32577,13 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index 37a26c4bb394..a832c376bdfe 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -15,16 +15,18 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -34,16 +36,18 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -53,75 +57,87 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_nontemporal_load_0: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_nontemporal_load_0: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_nontemporal_load_0: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -130,15 +146,19 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX12-CU-LABEL: flat_nontemporal_load_0: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -148,13 +168,17 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX1250-LABEL: flat_nontemporal_load_0: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -171,14 +195,17 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 -; GFX7-NEXT: s_mov_b32 s6, 2 -; GFX7-NEXT: v_lshlrev_b32_e64 v1, s6, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, 0 -; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX7-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 2 +; GFX7-NEXT: v_lshl_b64 v[1:2], v[0:1], s6 ; GFX7-NEXT: s_mov_b32 s6, s8 ; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_mov_b32 s8, s9 @@ -203,15 +230,18 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_mov_b32 s6, 2 -; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v1, s6, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-WGP-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, 2 +; GFX10-WGP-NEXT: v_lshlrev_b64 v[1:2], s6, v[0:1] ; GFX10-WGP-NEXT: s_mov_b32 s7, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-WGP-NEXT: s_mov_b32 s6, s9 @@ -235,15 +265,18 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 -; GFX10-CU-NEXT: s_mov_b32 s6, 2 -; GFX10-CU-NEXT: v_lshlrev_b32_e64 v1, s6, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX10-CU-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-CU-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, 2 +; GFX10-CU-NEXT: v_lshlrev_b64 v[1:2], s6, v[0:1] ; GFX10-CU-NEXT: s_mov_b32 s7, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-CU-NEXT: s_mov_b32 s6, s9 @@ -263,14 +296,17 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 2 -; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 +; SKIP-CACHE-INV-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 2 +; SKIP-CACHE-INV-NEXT: v_lshl_b64 v[1:2], v[0:1], s2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s5 @@ -291,16 +327,19 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX11-WGP-LABEL: flat_nontemporal_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff ; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-WGP-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-WGP-NEXT: s_mov_b32 s2, 2 -; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_lshlrev_b64 v[1:2], s2, v[0:1] ; GFX11-WGP-NEXT: s_mov_b32 s3, s4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-WGP-NEXT: s_mov_b32 s2, s5 @@ -320,16 +359,19 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX11-CU-LABEL: flat_nontemporal_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff ; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-CU-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-CU-NEXT: s_mov_b32 s2, 2 -; GFX11-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX11-CU-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_lshlrev_b64 v[1:2], s2, v[0:1] ; GFX11-CU-NEXT: s_mov_b32 s3, s4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-CU-NEXT: s_mov_b32 s2, s5 @@ -349,17 +391,20 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-WGP-LABEL: flat_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX12-WGP-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 ; GFX12-WGP-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_lshlrev_b64_e64 v[1:2], s2, v[0:1] ; GFX12-WGP-NEXT: s_mov_b32 s3, s4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-WGP-NEXT: s_mov_b32 s2, s5 @@ -383,17 +428,20 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-CU-LABEL: flat_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX12-CU-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX12-CU-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 ; GFX12-CU-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX12-CU-NEXT: v_mov_b32_e32 v2, v0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_lshlrev_b64_e64 v[1:2], s2, v[0:1] ; GFX12-CU-NEXT: s_mov_b32 s3, s4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-CU-NEXT: s_mov_b32 s2, s5 @@ -418,16 +466,20 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b32 s4, 0x3ff ; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -445,15 +497,19 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm @@ -464,15 +520,19 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: s_endpgm @@ -483,72 +543,92 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_nontemporal_store_0: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 dlc ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_nontemporal_store_0: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 dlc ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_nontemporal_store_0: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -562,12 +642,16 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX12-CU-LABEL: flat_nontemporal_store_0: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -582,13 +666,17 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX1250-LABEL: flat_nontemporal_store_0: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -606,16 +694,20 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: flat_load_dword v2, v[1:2] +; GFX7-NEXT: v_ashrrev_i32_e64 v3, 31, v0 +; GFX7-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_mov_b32 s4, 2 -; GFX7-NEXT: v_lshlrev_b32_e64 v3, s4, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, 0 -; GFX7-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_lshl_b64 v[3:4], v[0:1], s4 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_mov_b32 s6, s7 @@ -637,16 +729,20 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-WGP-NEXT: flat_load_dword v2, v[1:2] +; GFX10-WGP-NEXT: v_ashrrev_i32_e64 v3, 31, v0 +; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3 ; GFX10-WGP-NEXT: s_mov_b32 s4, 2 -; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v3, s4, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GFX10-WGP-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-WGP-NEXT: v_lshlrev_b64 v[3:4], s4, v[0:1] ; GFX10-WGP-NEXT: s_mov_b32 s5, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-WGP-NEXT: s_mov_b32 s4, s7 @@ -667,16 +763,20 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-CU-NEXT: flat_load_dword v2, v[1:2] +; GFX10-CU-NEXT: v_ashrrev_i32_e64 v3, 31, v0 +; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3 ; GFX10-CU-NEXT: s_mov_b32 s4, 2 -; GFX10-CU-NEXT: v_lshlrev_b32_e64 v3, s4, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GFX10-CU-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-CU-NEXT: v_lshlrev_b64 v[3:4], s4, v[0:1] ; GFX10-CU-NEXT: s_mov_b32 s5, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-CU-NEXT: s_mov_b32 s4, s7 @@ -693,16 +793,20 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[1:2] +; SKIP-CACHE-INV-NEXT: v_ashrrev_i32_e64 v3, 31, v0 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v3 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, 2 -; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v4, v0 +; SKIP-CACHE-INV-NEXT: v_lshl_b64 v[3:4], v[0:1], s0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, v3 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s3 @@ -720,6 +824,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX11-WGP-LABEL: flat_nontemporal_store_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 @@ -727,11 +835,11 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2] ; GFX11-WGP-NEXT: s_mov_b32 s0, 0x3ff ; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s0 +; GFX11-WGP-NEXT: v_ashrrev_i32_e64 v3, 31, v0 +; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3 ; GFX11-WGP-NEXT: s_mov_b32 s0, 2 -; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GFX11-WGP-NEXT: v_mov_b32_e32 v4, v0 +; GFX11-WGP-NEXT: v_lshlrev_b64 v[3:4], s0, v[0:1] ; GFX11-WGP-NEXT: s_mov_b32 s1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-WGP-NEXT: s_mov_b32 s0, s3 @@ -748,6 +856,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX11-CU-LABEL: flat_nontemporal_store_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 @@ -755,11 +867,11 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2] ; GFX11-CU-NEXT: s_mov_b32 s0, 0x3ff ; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s0 +; GFX11-CU-NEXT: v_ashrrev_i32_e64 v3, 31, v0 +; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3 ; GFX11-CU-NEXT: s_mov_b32 s0, 2 -; GFX11-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GFX11-CU-NEXT: v_mov_b32_e32 v4, v0 +; GFX11-CU-NEXT: v_lshlrev_b64 v[3:4], s0, v[0:1] ; GFX11-CU-NEXT: s_mov_b32 s1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-CU-NEXT: s_mov_b32 s0, s3 @@ -776,6 +888,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-WGP-LABEL: flat_nontemporal_store_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 @@ -784,12 +900,12 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-WGP-NEXT: s_mov_b32 s0, 0x3ff ; GFX12-WGP-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s0 +; GFX12-WGP-NEXT: v_ashrrev_i32_e64 v3, 31, v0 +; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3 ; GFX12-WGP-NEXT: s_mov_b32 s0, 2 ; GFX12-WGP-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v0 +; GFX12-WGP-NEXT: v_lshlrev_b64_e64 v[3:4], s0, v[0:1] ; GFX12-WGP-NEXT: s_mov_b32 s1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-WGP-NEXT: s_mov_b32 s0, s3 @@ -812,6 +928,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-CU-LABEL: flat_nontemporal_store_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 @@ -820,12 +940,12 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-CU-NEXT: s_mov_b32 s0, 0x3ff ; GFX12-CU-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s0 +; GFX12-CU-NEXT: v_ashrrev_i32_e64 v3, 31, v0 +; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3 ; GFX12-CU-NEXT: s_mov_b32 s0, 2 ; GFX12-CU-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GFX12-CU-NEXT: v_mov_b32_e32 v4, v0 +; GFX12-CU-NEXT: v_lshlrev_b64_e64 v[3:4], s0, v[0:1] ; GFX12-CU-NEXT: s_mov_b32 s1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-CU-NEXT: s_mov_b32 s0, s3 @@ -848,16 +968,20 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX1250-LABEL: flat_nontemporal_store_1: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b32 s2, 0x3ff ; GFX1250-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -876,9 +1000,12 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] @@ -895,9 +1022,12 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc @@ -914,9 +1044,12 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] @@ -929,9 +1062,12 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; ; SKIP-CACHE-INV-LABEL: flat_volatile_workgroup_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] @@ -944,9 +1080,12 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; ; GFX11-WGP-LABEL: flat_volatile_workgroup_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc @@ -959,9 +1098,12 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; ; GFX11-CU-LABEL: flat_volatile_workgroup_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] @@ -974,14 +1116,18 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; ; GFX12-WGP-LABEL: flat_volatile_workgroup_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -989,13 +1135,17 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; ; GFX12-CU-LABEL: flat_volatile_workgroup_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] ; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -1005,12 +1155,16 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; GFX1250-LABEL: flat_volatile_workgroup_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -1027,6 +1181,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1043,6 +1201,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -1060,6 +1222,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -1073,6 +1239,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; SKIP-CACHE-INV-LABEL: flat_volatile_workgroup_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -1085,6 +1255,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX11-WGP-LABEL: flat_volatile_workgroup_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1098,6 +1272,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX11-CU-LABEL: flat_volatile_workgroup_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1111,6 +1289,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX12-WGP-LABEL: flat_volatile_workgroup_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1126,6 +1308,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX12-CU-LABEL: flat_volatile_workgroup_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1141,14 +1327,18 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX1250-LABEL: flat_volatile_workgroup_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll index 17d81d5ac587..62494945063e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -19,15 +19,19 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -37,15 +41,19 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -55,29 +63,37 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -85,13 +101,17 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -99,11 +119,15 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -111,23 +135,31 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -135,40 +167,52 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX11-WGP-LABEL: flat_wavefront_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -177,12 +221,16 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX12-CU-LABEL: flat_wavefront_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -192,12 +240,16 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX1250-LABEL: flat_wavefront_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -213,15 +265,19 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -231,15 +287,19 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -249,29 +309,37 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -279,13 +347,17 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -293,11 +365,15 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -305,23 +381,31 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -329,40 +413,52 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX11-WGP-LABEL: flat_wavefront_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -371,12 +467,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX12-CU-LABEL: flat_wavefront_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -386,12 +486,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; GFX1250-LABEL: flat_wavefront_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -407,15 +511,19 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -425,15 +533,19 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -443,29 +555,37 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -473,13 +593,17 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -487,11 +611,15 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -499,23 +627,31 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -523,40 +659,52 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX11-WGP-LABEL: flat_wavefront_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -565,12 +713,16 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX12-CU-LABEL: flat_wavefront_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -580,12 +732,16 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX1250-LABEL: flat_wavefront_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -601,15 +757,19 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -619,15 +779,19 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -637,29 +801,37 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -667,13 +839,17 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -681,11 +857,15 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -693,23 +873,31 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -717,40 +905,52 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -759,12 +959,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX12-CU-LABEL: flat_wavefront_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -774,12 +978,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX1250-LABEL: flat_wavefront_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -796,6 +1004,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -811,6 +1023,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -826,6 +1042,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -837,6 +1057,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; SKIP-CACHE-INV-LABEL: flat_wavefront_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -850,6 +1074,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -862,6 +1090,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -872,6 +1104,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -882,6 +1118,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX942-TGSPLIT-LABEL: flat_wavefront_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -892,6 +1132,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX11-WGP-LABEL: flat_wavefront_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -903,6 +1147,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX11-CU-LABEL: flat_wavefront_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -914,6 +1162,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX12-WGP-LABEL: flat_wavefront_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -925,6 +1177,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX12-CU-LABEL: flat_wavefront_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -936,12 +1192,16 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX1250-LABEL: flat_wavefront_unordered_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -957,6 +1217,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -972,6 +1236,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -987,6 +1255,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -998,6 +1270,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -1011,6 +1287,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1023,6 +1303,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1033,6 +1317,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1043,6 +1331,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; GFX942-TGSPLIT-LABEL: flat_wavefront_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1053,6 +1345,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; GFX11-WGP-LABEL: flat_wavefront_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1064,6 +1360,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; GFX11-CU-LABEL: flat_wavefront_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1075,6 +1375,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; GFX12-WGP-LABEL: flat_wavefront_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1086,6 +1390,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; GFX12-CU-LABEL: flat_wavefront_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1097,12 +1405,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; GFX1250-LABEL: flat_wavefront_monotonic_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1118,6 +1430,10 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1133,6 +1449,10 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -1148,6 +1468,10 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -1159,6 +1483,10 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -1172,6 +1500,10 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1184,6 +1516,10 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1194,6 +1530,10 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1204,6 +1544,10 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX942-TGSPLIT-LABEL: flat_wavefront_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1214,6 +1558,10 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX11-WGP-LABEL: flat_wavefront_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1225,6 +1573,10 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX11-CU-LABEL: flat_wavefront_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1236,6 +1588,10 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX12-WGP-LABEL: flat_wavefront_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1247,6 +1603,10 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX12-CU-LABEL: flat_wavefront_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1258,12 +1618,16 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX1250-LABEL: flat_wavefront_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1279,6 +1643,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1294,6 +1662,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -1309,6 +1681,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -1320,6 +1696,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -1333,6 +1713,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1345,6 +1729,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1355,6 +1743,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1365,6 +1757,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX942-TGSPLIT-LABEL: flat_wavefront_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1375,6 +1771,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1386,6 +1786,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX11-CU-LABEL: flat_wavefront_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1397,6 +1801,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1408,6 +1816,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX12-CU-LABEL: flat_wavefront_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1419,12 +1831,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX1250-LABEL: flat_wavefront_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1439,11 +1855,15 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -1454,11 +1874,15 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -1469,22 +1893,30 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1493,10 +1925,14 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1505,74 +1941,102 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm @@ -1580,7 +2044,11 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; GFX1250-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1600,11 +2068,15 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -1615,11 +2087,15 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -1630,22 +2106,30 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1654,10 +2138,14 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1666,74 +2154,102 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm @@ -1741,7 +2257,11 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX1250-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1761,11 +2281,15 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -1776,11 +2300,15 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -1791,22 +2319,30 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1815,10 +2351,14 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1827,74 +2367,102 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_wavefront_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm @@ -1902,7 +2470,11 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX1250-LABEL: flat_wavefront_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1922,11 +2494,15 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -1937,11 +2513,15 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -1952,22 +2532,30 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1976,10 +2564,14 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1988,74 +2580,102 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm @@ -2063,7 +2683,11 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX1250-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2083,11 +2707,15 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -2098,11 +2726,15 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -2113,22 +2745,30 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -2137,10 +2777,14 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2149,74 +2793,102 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm @@ -2224,7 +2896,11 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX1250-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2245,6 +2921,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2264,6 +2944,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2283,6 +2967,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2298,6 +2986,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2315,6 +3007,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2330,6 +3026,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2343,6 +3043,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2356,6 +3060,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2369,6 +3077,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2384,6 +3096,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2399,6 +3115,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2414,6 +3134,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2429,7 +3153,11 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX1250-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2453,6 +3181,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2472,6 +3204,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2491,6 +3227,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2506,6 +3246,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2523,6 +3267,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2538,6 +3286,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2551,6 +3303,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2564,6 +3320,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2577,6 +3337,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2592,6 +3356,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2607,6 +3375,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2622,6 +3394,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2637,7 +3413,11 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX1250-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2661,6 +3441,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2680,6 +3464,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2699,6 +3487,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2714,6 +3506,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2731,6 +3527,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2746,6 +3546,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2759,6 +3563,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2772,6 +3580,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2785,6 +3597,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2800,6 +3616,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2815,6 +3635,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2830,6 +3654,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2845,7 +3673,11 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX1250-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2869,11 +3701,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -2883,6 +3720,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -2898,11 +3736,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -2912,6 +3755,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -2927,11 +3771,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -2941,6 +3790,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -2952,11 +3802,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -2966,6 +3821,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -2979,6 +3835,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2995,6 +3857,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3009,6 +3877,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3023,6 +3897,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3037,6 +3917,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3052,6 +3938,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3067,6 +3959,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3082,6 +3980,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3097,7 +4001,13 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3123,11 +4033,16 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3137,6 +4052,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3152,11 +4068,16 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -3166,6 +4087,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -3181,11 +4103,16 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -3195,6 +4122,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -3206,11 +4134,16 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -3220,6 +4153,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -3233,6 +4167,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3249,6 +4189,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3263,6 +4209,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3277,6 +4229,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3291,6 +4249,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3306,6 +4270,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3321,6 +4291,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3336,6 +4312,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3351,7 +4333,13 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3377,11 +4365,16 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3391,6 +4384,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3406,11 +4400,16 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -3420,6 +4419,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -3435,11 +4435,16 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -3449,6 +4454,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -3460,11 +4466,16 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -3474,6 +4485,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -3487,6 +4499,12 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3503,6 +4521,12 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3517,6 +4541,12 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3531,6 +4561,12 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3545,6 +4581,12 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3560,6 +4602,12 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3575,6 +4623,12 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3590,6 +4644,12 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3605,7 +4665,13 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX1250-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3631,11 +4697,16 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3645,6 +4716,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3660,11 +4732,16 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -3674,6 +4751,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -3689,11 +4767,16 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -3703,6 +4786,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -3714,11 +4798,16 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -3728,6 +4817,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -3741,6 +4831,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3757,6 +4853,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3771,6 +4873,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3785,6 +4893,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3799,6 +4913,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3814,6 +4934,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3829,6 +4955,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3844,6 +4976,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3859,7 +4997,13 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3885,11 +5029,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3899,6 +5048,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3914,11 +5064,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -3928,6 +5083,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -3943,11 +5099,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -3957,6 +5118,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -3968,11 +5130,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -3982,6 +5149,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -3995,6 +5163,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4011,6 +5185,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4025,6 +5205,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4039,6 +5225,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4053,6 +5245,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4068,6 +5266,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4083,6 +5287,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4098,6 +5308,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4113,7 +5329,13 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4139,11 +5361,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4153,6 +5380,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4168,11 +5396,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -4182,6 +5415,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -4197,11 +5431,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -4211,6 +5450,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -4222,11 +5462,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -4236,6 +5481,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -4249,6 +5495,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4265,6 +5517,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4279,6 +5537,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4293,6 +5557,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4307,6 +5577,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4322,6 +5598,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4337,6 +5619,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4352,6 +5640,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4367,7 +5661,13 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4393,11 +5693,16 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4407,6 +5712,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4422,11 +5728,16 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -4436,6 +5747,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -4451,11 +5763,16 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -4465,6 +5782,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -4476,11 +5794,16 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -4490,6 +5813,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -4503,6 +5827,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4519,6 +5849,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4533,6 +5869,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4547,6 +5889,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4561,6 +5909,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4576,6 +5930,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4591,6 +5951,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4606,6 +5972,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4621,7 +5993,13 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX1250-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4647,11 +6025,16 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4661,6 +6044,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4676,11 +6060,16 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -4690,6 +6079,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -4705,11 +6095,16 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -4719,6 +6114,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -4730,11 +6126,16 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -4744,6 +6145,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -4757,6 +6159,12 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4773,6 +6181,12 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4787,6 +6201,12 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4801,6 +6221,12 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4815,6 +6241,12 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4830,6 +6262,12 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4845,6 +6283,12 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4860,6 +6304,12 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4875,7 +6325,13 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX1250-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4901,11 +6357,16 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4915,6 +6376,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4930,11 +6392,16 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -4944,6 +6411,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -4959,11 +6427,16 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -4973,6 +6446,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -4984,11 +6458,16 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -4998,6 +6477,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -5011,6 +6491,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5027,6 +6513,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5041,6 +6533,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5055,6 +6553,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5069,6 +6573,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5084,6 +6594,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5099,6 +6615,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5114,6 +6636,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5129,7 +6657,13 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5155,11 +6689,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5169,6 +6708,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5184,11 +6724,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -5198,6 +6743,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -5213,11 +6759,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -5227,6 +6778,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -5238,11 +6790,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -5252,6 +6809,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -5265,6 +6823,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5281,6 +6845,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5295,6 +6865,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5309,6 +6885,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5323,6 +6905,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5338,6 +6926,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5353,6 +6947,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5368,6 +6968,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5383,7 +6989,13 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5409,11 +7021,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5423,6 +7040,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5438,11 +7056,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -5452,6 +7075,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -5467,11 +7091,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -5481,6 +7110,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -5492,11 +7122,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -5506,6 +7141,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -5519,6 +7155,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5535,6 +7177,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5549,6 +7197,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5563,6 +7217,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5577,6 +7237,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5592,6 +7258,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5607,6 +7279,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5622,6 +7300,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5637,7 +7321,13 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5663,11 +7353,16 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5677,6 +7372,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5692,11 +7388,16 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -5706,6 +7407,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -5721,11 +7423,16 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -5735,6 +7442,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -5746,11 +7454,16 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -5760,6 +7473,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -5773,6 +7487,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5789,6 +7509,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5803,6 +7529,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5817,6 +7549,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5831,6 +7569,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5846,6 +7590,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5861,6 +7611,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5876,6 +7632,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5891,7 +7653,13 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5917,11 +7685,16 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5931,6 +7704,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5946,11 +7720,16 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -5960,6 +7739,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -5975,11 +7755,16 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -5989,6 +7774,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -6000,11 +7786,16 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -6014,6 +7805,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -6027,6 +7819,12 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6043,6 +7841,12 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6057,6 +7861,12 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6071,6 +7881,12 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6085,6 +7901,12 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6100,6 +7922,12 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6115,6 +7943,12 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6130,6 +7964,12 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6145,7 +7985,13 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6171,11 +8017,16 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6185,6 +8036,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6200,11 +8052,16 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -6214,6 +8071,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -6229,11 +8087,16 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -6243,6 +8106,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -6254,11 +8118,16 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -6268,6 +8137,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -6281,6 +8151,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6297,6 +8173,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6311,6 +8193,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6325,6 +8213,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6339,6 +8233,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6354,6 +8254,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6369,6 +8275,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6384,6 +8296,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6399,7 +8317,13 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6425,11 +8349,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6439,6 +8368,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6454,11 +8384,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -6468,6 +8403,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -6483,11 +8419,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -6497,6 +8438,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -6508,11 +8450,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -6522,6 +8469,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -6535,6 +8483,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6551,6 +8505,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6565,6 +8525,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6579,6 +8545,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6593,6 +8565,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6608,6 +8586,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6623,6 +8607,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6638,6 +8628,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6653,7 +8649,13 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6680,6 +8682,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -6713,6 +8721,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -6746,6 +8760,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -6775,6 +8795,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -6805,6 +8831,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6824,6 +8856,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6841,6 +8879,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6858,6 +8902,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6875,6 +8925,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6894,6 +8950,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6913,6 +8975,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6932,6 +9000,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6951,7 +9025,13 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6982,6 +9062,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7015,6 +9101,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -7048,6 +9140,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -7077,6 +9175,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -7107,6 +9211,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7126,6 +9236,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7143,6 +9259,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7160,6 +9282,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7177,6 +9305,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7196,6 +9330,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7215,6 +9355,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7234,6 +9380,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7253,7 +9405,13 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7284,6 +9442,12 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7317,6 +9481,12 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -7350,6 +9520,12 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -7379,6 +9555,12 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -7409,6 +9591,12 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7428,6 +9616,12 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7445,6 +9639,12 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7462,6 +9662,12 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7479,6 +9685,12 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7498,6 +9710,12 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7517,6 +9735,12 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7536,6 +9760,12 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7555,7 +9785,13 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7586,6 +9822,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7619,6 +9861,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -7652,6 +9900,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -7681,6 +9935,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -7711,6 +9971,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7730,6 +9996,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7747,6 +10019,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7764,6 +10042,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7781,6 +10065,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7800,6 +10090,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7819,6 +10115,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7838,6 +10140,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7857,7 +10165,13 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7888,6 +10202,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7921,6 +10241,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -7954,6 +10280,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -7983,6 +10315,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -8013,6 +10351,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8032,6 +10376,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8049,6 +10399,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8066,6 +10422,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8083,6 +10445,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8102,6 +10470,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8121,6 +10495,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8140,6 +10520,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8159,7 +10545,13 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8190,6 +10582,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8223,6 +10621,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -8256,6 +10660,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -8285,6 +10695,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -8315,6 +10731,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8334,6 +10756,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8351,6 +10779,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8368,6 +10802,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8385,6 +10825,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8404,6 +10850,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8423,6 +10875,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8442,6 +10900,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8461,7 +10925,13 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8492,6 +10962,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8525,6 +11001,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -8558,6 +11040,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -8587,6 +11075,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -8617,6 +11111,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8636,6 +11136,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8653,6 +11159,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8670,6 +11182,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8687,6 +11205,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8706,6 +11230,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8725,6 +11255,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8744,6 +11280,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8763,7 +11305,13 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8794,6 +11342,12 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8827,6 +11381,12 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -8860,6 +11420,12 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -8889,6 +11455,12 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -8919,6 +11491,12 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8938,6 +11516,12 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8955,6 +11539,12 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8972,6 +11562,12 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8989,6 +11585,12 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9008,6 +11610,12 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9027,6 +11635,12 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9046,6 +11660,12 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9065,7 +11685,13 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9096,6 +11722,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9129,6 +11761,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -9162,6 +11800,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -9191,6 +11835,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -9221,6 +11871,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9240,6 +11896,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9257,6 +11919,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9274,6 +11942,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9291,6 +11965,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9310,6 +11990,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9329,6 +12015,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9348,6 +12040,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9367,7 +12065,13 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9398,6 +12102,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9431,6 +12141,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -9464,6 +12180,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -9493,6 +12215,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -9523,6 +12251,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9542,6 +12276,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9559,6 +12299,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9576,6 +12322,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9593,6 +12345,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9612,6 +12370,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9631,6 +12395,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9650,6 +12420,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9669,7 +12445,13 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9700,6 +12482,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9733,6 +12521,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -9766,6 +12560,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -9795,6 +12595,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -9825,6 +12631,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9844,6 +12656,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9861,6 +12679,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9878,6 +12702,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9895,6 +12725,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9914,6 +12750,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9933,6 +12775,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9952,6 +12800,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9971,7 +12825,13 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10002,6 +12862,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10035,6 +12901,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -10068,6 +12940,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -10097,6 +12975,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -10127,6 +13011,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10146,6 +13036,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10163,6 +13059,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10180,6 +13082,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10197,6 +13105,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10216,6 +13130,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10235,6 +13155,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10254,6 +13180,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10273,7 +13205,13 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10304,6 +13242,12 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10337,6 +13281,12 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -10370,6 +13320,12 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -10399,6 +13355,12 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -10429,6 +13391,12 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10448,6 +13416,12 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10465,6 +13439,12 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10482,6 +13462,12 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10499,6 +13485,12 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10518,6 +13510,12 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10537,6 +13535,12 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10556,6 +13560,12 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10575,7 +13585,13 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10606,6 +13622,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10639,6 +13661,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -10672,6 +13700,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -10701,6 +13735,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -10731,6 +13771,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10750,6 +13796,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10767,6 +13819,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10784,6 +13842,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10801,6 +13865,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10820,6 +13890,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10839,6 +13915,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10858,6 +13940,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10877,7 +13965,13 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10908,6 +14002,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10941,6 +14041,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -10974,6 +14080,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -11003,6 +14115,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -11033,6 +14151,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11052,6 +14176,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11069,6 +14199,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11086,6 +14222,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11103,6 +14245,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11122,6 +14270,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11141,6 +14295,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11160,6 +14320,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11179,7 +14345,13 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -11208,15 +14380,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11226,15 +14402,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -11244,29 +14424,37 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11274,13 +14462,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11288,11 +14480,15 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11300,23 +14496,31 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11324,40 +14528,52 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11366,12 +14582,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11381,12 +14601,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; GFX1250-LABEL: flat_wavefront_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -11402,15 +14626,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11420,15 +14648,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -11438,29 +14670,37 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11468,13 +14708,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11482,11 +14726,15 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11494,23 +14742,31 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11518,40 +14774,52 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11560,12 +14828,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11575,12 +14847,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -11596,15 +14872,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11614,15 +14894,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -11632,29 +14916,37 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11662,13 +14954,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11676,11 +14972,15 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11688,23 +14988,31 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11712,40 +15020,52 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11754,12 +15074,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11769,12 +15093,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX1250-LABEL: flat_wavefront_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -11790,15 +15118,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11808,15 +15140,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -11826,29 +15162,37 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11856,13 +15200,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11870,11 +15218,15 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11882,23 +15234,31 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11906,40 +15266,52 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11948,12 +15320,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11963,12 +15339,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -11985,6 +15365,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -12000,6 +15384,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -12015,6 +15403,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -12026,6 +15418,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -12039,6 +15435,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -12051,6 +15451,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -12061,6 +15465,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -12071,6 +15479,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -12081,6 +15493,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX11-WGP-LABEL: flat_wavefront_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -12092,6 +15508,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX11-CU-LABEL: flat_wavefront_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -12103,6 +15523,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX12-WGP-LABEL: flat_wavefront_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -12114,6 +15538,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX12-CU-LABEL: flat_wavefront_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -12125,12 +15553,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX1250-LABEL: flat_wavefront_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -12146,6 +15578,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -12161,6 +15597,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -12176,6 +15616,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -12187,6 +15631,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -12200,6 +15648,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -12212,6 +15664,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -12222,6 +15678,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -12232,6 +15692,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -12242,6 +15706,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -12253,6 +15721,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -12264,6 +15736,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -12275,6 +15751,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -12286,12 +15766,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -12307,6 +15791,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -12322,6 +15810,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -12337,6 +15829,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -12348,6 +15844,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -12361,6 +15861,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -12373,6 +15877,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -12383,6 +15891,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -12393,6 +15905,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -12403,6 +15919,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -12414,6 +15934,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX11-CU-LABEL: flat_wavefront_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -12425,6 +15949,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -12436,6 +15964,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX12-CU-LABEL: flat_wavefront_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -12447,12 +15979,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX1250-LABEL: flat_wavefront_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -12468,6 +16004,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -12483,6 +16023,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -12498,6 +16042,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -12509,6 +16057,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -12522,6 +16074,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -12534,6 +16090,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -12544,6 +16104,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -12554,6 +16118,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -12564,6 +16132,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -12575,6 +16147,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -12586,6 +16162,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -12597,6 +16177,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -12608,12 +16192,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -12628,11 +16216,15 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -12643,11 +16235,15 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -12658,22 +16254,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -12682,10 +16286,14 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -12694,74 +16302,102 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm @@ -12769,7 +16405,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12789,11 +16429,15 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -12804,11 +16448,15 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -12819,22 +16467,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -12843,10 +16499,14 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -12855,74 +16515,102 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm @@ -12930,7 +16618,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX1250-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12950,11 +16642,15 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -12965,11 +16661,15 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -12980,22 +16680,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -13004,10 +16712,14 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -13016,74 +16728,102 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm @@ -13091,7 +16831,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX1250-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13111,11 +16855,15 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -13126,11 +16874,15 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -13141,22 +16893,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -13165,10 +16925,14 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -13177,74 +16941,102 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm @@ -13252,7 +17044,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13272,11 +17068,15 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -13287,11 +17087,15 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -13302,22 +17106,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -13326,10 +17138,14 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -13338,74 +17154,102 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm @@ -13413,7 +17257,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13434,6 +17282,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13453,6 +17305,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -13472,6 +17328,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -13487,6 +17347,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -13504,6 +17368,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -13519,6 +17387,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -13532,6 +17404,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -13545,6 +17421,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -13558,6 +17438,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -13573,6 +17457,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -13588,6 +17476,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -13603,6 +17495,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -13618,7 +17514,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX1250-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13642,6 +17542,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13661,6 +17565,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -13680,6 +17588,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -13695,6 +17607,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -13712,6 +17628,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -13727,6 +17647,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -13740,6 +17664,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -13753,6 +17681,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -13766,6 +17698,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -13781,6 +17717,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -13796,6 +17736,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -13811,6 +17755,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -13826,7 +17774,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13850,6 +17802,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13869,6 +17825,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -13888,6 +17848,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -13903,6 +17867,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -13920,6 +17888,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -13935,6 +17907,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -13948,6 +17924,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -13961,6 +17941,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -13974,6 +17958,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -13989,6 +17977,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -14004,6 +17996,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -14019,6 +18015,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -14034,7 +18034,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14058,11 +18062,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14072,6 +18081,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -14087,11 +18097,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -14101,6 +18116,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -14116,11 +18132,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -14130,6 +18151,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -14141,11 +18163,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -14155,6 +18182,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -14168,6 +18196,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14184,6 +18218,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14198,6 +18238,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14212,6 +18258,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14226,6 +18278,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14241,6 +18299,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14256,6 +18320,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14271,6 +18341,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14286,7 +18362,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -14312,11 +18394,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14326,6 +18413,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -14341,11 +18429,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -14355,6 +18448,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -14370,11 +18464,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -14384,6 +18483,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -14395,11 +18495,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -14409,6 +18514,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -14422,6 +18528,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14438,6 +18550,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14452,6 +18570,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14466,6 +18590,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14480,6 +18610,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14495,6 +18631,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14510,6 +18652,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14525,6 +18673,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14540,7 +18694,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -14566,11 +18726,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14580,6 +18745,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -14595,11 +18761,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -14609,6 +18780,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -14624,11 +18796,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -14638,6 +18815,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -14649,11 +18827,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -14663,6 +18846,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -14676,6 +18860,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14692,6 +18882,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14706,6 +18902,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14720,6 +18922,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14734,6 +18942,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14749,6 +18963,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14764,6 +18984,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14779,6 +19005,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14794,7 +19026,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -14820,11 +19058,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14834,6 +19077,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -14849,11 +19093,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -14863,6 +19112,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -14878,11 +19128,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -14892,6 +19147,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -14903,11 +19159,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -14917,6 +19178,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -14930,6 +19192,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14946,6 +19214,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14960,6 +19234,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14974,6 +19254,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14988,6 +19274,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15003,6 +19295,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15018,6 +19316,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15033,6 +19337,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15048,7 +19358,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15074,11 +19390,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15088,6 +19409,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15103,11 +19425,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -15117,6 +19444,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -15132,11 +19460,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -15146,6 +19479,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -15157,11 +19491,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -15171,6 +19510,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -15184,6 +19524,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15200,6 +19546,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15214,6 +19566,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15228,6 +19586,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15242,6 +19606,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15257,6 +19627,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15272,6 +19648,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15287,6 +19669,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15302,7 +19690,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15328,11 +19722,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15342,6 +19741,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15357,11 +19757,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -15371,6 +19776,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -15386,11 +19792,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -15400,6 +19811,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -15411,11 +19823,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -15425,6 +19842,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -15438,6 +19856,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15454,6 +19878,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15468,6 +19898,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15482,6 +19918,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15496,6 +19938,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15511,6 +19959,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15526,6 +19980,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15541,6 +20001,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15556,7 +20022,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15582,11 +20054,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15596,6 +20073,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15611,11 +20089,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -15625,6 +20108,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -15640,11 +20124,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -15654,6 +20143,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -15665,11 +20155,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -15679,6 +20174,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -15692,6 +20188,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15708,6 +20210,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15722,6 +20230,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15736,6 +20250,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15750,6 +20270,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15765,6 +20291,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15780,6 +20312,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15795,6 +20333,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15810,7 +20354,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15836,11 +20386,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15850,6 +20405,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15865,11 +20421,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -15879,6 +20440,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -15894,11 +20456,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -15908,6 +20475,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -15919,11 +20487,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -15933,6 +20506,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -15946,6 +20520,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15962,6 +20542,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15976,6 +20562,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15990,6 +20582,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16004,6 +20602,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16019,6 +20623,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16034,6 +20644,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16049,6 +20665,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16064,7 +20686,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16090,11 +20718,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16104,6 +20737,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16119,11 +20753,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -16133,6 +20772,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -16148,11 +20788,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -16162,6 +20807,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -16173,11 +20819,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -16187,6 +20838,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -16200,6 +20852,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16216,6 +20874,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16230,6 +20894,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16244,6 +20914,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16258,6 +20934,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16273,6 +20955,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16288,6 +20976,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16303,6 +20997,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16318,7 +21018,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16344,11 +21050,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16358,6 +21069,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16373,11 +21085,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -16387,6 +21104,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -16402,11 +21120,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -16416,6 +21139,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -16427,11 +21151,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -16441,6 +21170,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -16454,6 +21184,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16470,6 +21206,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16484,6 +21226,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16498,6 +21246,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16512,6 +21266,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16527,6 +21287,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16542,6 +21308,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16557,6 +21329,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16572,7 +21350,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16598,11 +21382,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16612,6 +21401,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16627,11 +21417,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -16641,6 +21436,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -16656,11 +21452,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -16670,6 +21471,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -16681,11 +21483,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -16695,6 +21502,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -16708,6 +21516,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16724,6 +21538,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16738,6 +21558,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16752,6 +21578,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16766,6 +21598,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16781,6 +21619,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16796,6 +21640,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16811,6 +21661,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16826,7 +21682,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16852,11 +21714,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16866,6 +21733,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16881,11 +21749,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -16895,6 +21768,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -16910,11 +21784,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -16924,6 +21803,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -16935,11 +21815,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -16949,6 +21834,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -16962,6 +21848,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16978,6 +21870,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16992,6 +21890,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17006,6 +21910,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17020,6 +21930,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17035,6 +21951,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17050,6 +21972,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17065,6 +21993,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17080,7 +22014,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17106,11 +22046,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17120,6 +22065,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17135,11 +22081,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -17149,6 +22100,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -17164,11 +22116,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -17178,6 +22135,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -17189,11 +22147,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -17203,6 +22166,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -17216,6 +22180,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17232,6 +22202,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17246,6 +22222,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17260,6 +22242,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17274,6 +22262,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17289,6 +22283,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17304,6 +22304,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17319,6 +22325,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17334,7 +22346,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17360,11 +22378,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17374,6 +22397,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17389,11 +22413,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -17403,6 +22432,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -17418,11 +22448,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -17432,6 +22467,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -17443,11 +22479,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -17457,6 +22498,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -17470,6 +22512,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17486,6 +22534,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17500,6 +22554,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17514,6 +22574,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17528,6 +22594,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17543,6 +22615,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17558,6 +22636,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17573,6 +22657,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17588,7 +22678,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17614,11 +22710,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17628,6 +22729,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17643,11 +22745,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -17657,6 +22764,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -17672,11 +22780,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -17686,6 +22799,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -17697,11 +22811,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -17711,6 +22830,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -17724,6 +22844,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17740,6 +22866,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17754,6 +22886,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17768,6 +22906,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17782,6 +22926,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17797,6 +22947,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17812,6 +22968,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17827,6 +22989,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17842,7 +23010,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17869,6 +23043,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -17902,6 +23082,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -17935,6 +23121,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -17964,6 +23156,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -17994,6 +23192,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18013,6 +23217,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18030,6 +23240,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18047,6 +23263,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18064,6 +23286,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18083,6 +23311,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18102,6 +23336,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18121,6 +23361,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18140,7 +23386,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18171,6 +23423,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -18204,6 +23462,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -18237,6 +23501,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -18266,6 +23536,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -18296,6 +23572,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18315,6 +23597,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18332,6 +23620,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18349,6 +23643,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18366,6 +23666,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18385,6 +23691,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18404,6 +23716,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18423,6 +23741,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18442,7 +23766,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18473,6 +23803,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -18506,6 +23842,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -18539,6 +23881,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -18568,6 +23916,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -18598,6 +23952,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18617,6 +23977,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18634,6 +24000,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18651,6 +24023,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18668,6 +24046,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18687,6 +24071,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18706,6 +24096,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18725,6 +24121,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18744,7 +24146,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18775,6 +24183,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -18808,6 +24222,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -18841,6 +24261,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -18870,6 +24296,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -18900,6 +24332,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18919,6 +24357,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18936,6 +24380,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18953,6 +24403,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18970,6 +24426,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18989,6 +24451,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19008,6 +24476,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19027,6 +24501,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19046,7 +24526,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19077,6 +24563,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19110,6 +24602,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -19143,6 +24641,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -19172,6 +24676,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -19202,6 +24712,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19221,6 +24737,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19238,6 +24760,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19255,6 +24783,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19272,6 +24806,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19291,6 +24831,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19310,6 +24856,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19329,6 +24881,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19348,7 +24906,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19379,6 +24943,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19412,6 +24982,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -19445,6 +25021,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -19474,6 +25056,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -19504,6 +25092,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19523,6 +25117,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19540,6 +25140,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19557,6 +25163,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19574,6 +25186,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19593,6 +25211,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19612,6 +25236,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19631,6 +25261,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19650,7 +25286,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19681,6 +25323,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19714,6 +25362,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -19747,6 +25401,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -19776,6 +25436,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -19806,6 +25472,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19825,6 +25497,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19842,6 +25520,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19859,6 +25543,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19876,6 +25566,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19895,6 +25591,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19914,6 +25616,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19933,6 +25641,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19952,7 +25666,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19983,6 +25703,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20016,6 +25742,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -20049,6 +25781,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -20078,6 +25816,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -20108,6 +25852,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20127,6 +25877,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20144,6 +25900,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20161,6 +25923,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20178,6 +25946,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20197,6 +25971,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20216,6 +25996,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20235,6 +26021,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20254,7 +26046,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20285,6 +26083,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20318,6 +26122,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -20351,6 +26161,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -20380,6 +26196,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -20410,6 +26232,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20429,6 +26257,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20446,6 +26280,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20463,6 +26303,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20480,6 +26326,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20499,6 +26351,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20518,6 +26376,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20537,6 +26401,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20556,7 +26426,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20587,6 +26463,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20620,6 +26502,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -20653,6 +26541,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -20682,6 +26576,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -20712,6 +26612,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20731,6 +26637,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20748,6 +26660,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20765,6 +26683,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20782,6 +26706,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20801,6 +26731,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20820,6 +26756,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20839,6 +26781,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20858,7 +26806,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20889,6 +26843,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20922,6 +26882,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -20955,6 +26921,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -20984,6 +26956,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -21014,6 +26992,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21033,6 +27017,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21050,6 +27040,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21067,6 +27063,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21084,6 +27086,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21103,6 +27111,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21122,6 +27136,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21141,6 +27161,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21160,7 +27186,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21191,6 +27223,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21224,6 +27262,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -21257,6 +27301,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -21286,6 +27336,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -21316,6 +27372,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21335,6 +27397,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21352,6 +27420,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21369,6 +27443,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21386,6 +27466,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21405,6 +27491,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21424,6 +27516,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21443,6 +27541,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21462,7 +27566,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21493,6 +27603,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21526,6 +27642,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -21559,6 +27681,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -21588,6 +27716,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -21618,6 +27752,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21637,6 +27777,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21654,6 +27800,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21671,6 +27823,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21688,6 +27846,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21707,6 +27871,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21726,6 +27896,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21745,6 +27921,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21764,7 +27946,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21795,6 +27983,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21828,6 +28022,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -21861,6 +28061,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -21890,6 +28096,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -21920,6 +28132,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21939,6 +28157,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21956,6 +28180,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21973,6 +28203,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21990,6 +28226,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22009,6 +28251,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22028,6 +28276,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22047,6 +28301,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22066,7 +28326,13 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index 741706e19aa3..d407090a7e52 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -19,15 +19,19 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -37,15 +41,19 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -55,29 +63,37 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -85,13 +101,17 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -99,11 +119,15 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -111,23 +135,31 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -135,40 +167,52 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX11-WGP-LABEL: flat_workgroup_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -177,12 +221,16 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX12-CU-LABEL: flat_workgroup_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -192,12 +240,16 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX1250-LABEL: flat_workgroup_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -213,15 +265,19 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -231,15 +287,19 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -249,29 +309,37 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -279,13 +347,17 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -293,11 +365,15 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -305,23 +381,31 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -329,40 +413,52 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX11-WGP-LABEL: flat_workgroup_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -371,12 +467,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX12-CU-LABEL: flat_workgroup_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -386,12 +486,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; GFX1250-LABEL: flat_workgroup_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -407,9 +511,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] @@ -426,9 +533,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc @@ -445,9 +555,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] @@ -460,9 +573,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] @@ -477,9 +593,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -492,22 +611,29 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -518,22 +644,29 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_workgroup_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc @@ -546,9 +679,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX11-CU-LABEL: flat_workgroup_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] @@ -561,14 +697,18 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX12-WGP-LABEL: flat_workgroup_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -576,13 +716,17 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX12-CU-LABEL: flat_workgroup_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] ; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -592,12 +736,16 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX1250-LABEL: flat_workgroup_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -613,9 +761,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -633,9 +784,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -654,9 +808,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -670,9 +827,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -688,9 +848,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -704,9 +867,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -718,9 +884,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 @@ -732,9 +901,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 @@ -746,9 +918,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -763,9 +938,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX11-CU-LABEL: flat_workgroup_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -779,9 +957,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -791,6 +972,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -798,9 +980,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX12-CU-LABEL: flat_workgroup_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -809,6 +994,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -817,14 +1003,18 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX1250-LABEL: flat_workgroup_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -841,6 +1031,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -856,6 +1050,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -871,6 +1069,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -882,6 +1084,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; SKIP-CACHE-INV-LABEL: flat_workgroup_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -895,6 +1101,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -907,6 +1117,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -917,6 +1131,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -927,6 +1145,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX942-TGSPLIT-LABEL: flat_workgroup_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -937,6 +1159,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX11-WGP-LABEL: flat_workgroup_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -948,6 +1174,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX11-CU-LABEL: flat_workgroup_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -959,6 +1189,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX12-WGP-LABEL: flat_workgroup_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -970,6 +1204,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX12-CU-LABEL: flat_workgroup_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -981,12 +1219,16 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX1250-LABEL: flat_workgroup_unordered_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1002,6 +1244,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1017,6 +1263,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -1032,6 +1282,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -1043,6 +1297,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -1056,6 +1314,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1068,6 +1330,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1078,6 +1344,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1088,6 +1358,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX942-TGSPLIT-LABEL: flat_workgroup_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1098,6 +1372,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX11-WGP-LABEL: flat_workgroup_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1109,6 +1387,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX11-CU-LABEL: flat_workgroup_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1120,6 +1402,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX12-WGP-LABEL: flat_workgroup_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1131,6 +1417,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX12-CU-LABEL: flat_workgroup_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1142,12 +1432,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX1250-LABEL: flat_workgroup_monotonic_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1163,6 +1457,10 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1179,6 +1477,10 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -1196,6 +1498,10 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -1209,6 +1515,10 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -1223,6 +1533,10 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1236,6 +1550,10 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1247,6 +1565,10 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1258,6 +1580,10 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX942-TGSPLIT-LABEL: flat_workgroup_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1269,6 +1595,10 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX11-WGP-LABEL: flat_workgroup_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1282,6 +1612,10 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX11-CU-LABEL: flat_workgroup_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1295,6 +1629,10 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX12-WGP-LABEL: flat_workgroup_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1310,6 +1648,10 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX12-CU-LABEL: flat_workgroup_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1325,14 +1667,18 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX1250-LABEL: flat_workgroup_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1348,6 +1694,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1364,6 +1714,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -1381,6 +1735,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -1394,6 +1752,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -1408,6 +1770,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1421,6 +1787,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -1432,6 +1802,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1443,6 +1817,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX942-TGSPLIT-LABEL: flat_workgroup_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1454,6 +1832,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1467,6 +1849,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX11-CU-LABEL: flat_workgroup_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1480,6 +1866,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -1495,6 +1885,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX12-CU-LABEL: flat_workgroup_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -1510,14 +1904,18 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX1250-LABEL: flat_workgroup_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1532,11 +1930,15 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -1547,11 +1949,15 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -1562,22 +1968,30 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1586,10 +2000,14 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1598,74 +2016,102 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm @@ -1673,7 +2119,11 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX1250-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1693,11 +2143,15 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1709,11 +2163,15 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1727,11 +2185,15 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1739,11 +2201,15 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1753,10 +2219,14 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1766,10 +2236,14 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -1778,10 +2252,14 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1789,10 +2267,14 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -1801,11 +2283,15 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1815,11 +2301,15 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX11-CU-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1827,11 +2317,15 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 @@ -1840,11 +2334,15 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX12-CU-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -1853,7 +2351,11 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX1250-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1874,11 +2376,15 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -1890,11 +2396,15 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1907,11 +2417,15 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1920,11 +2434,15 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 @@ -1934,10 +2452,14 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -1947,10 +2469,14 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -1958,10 +2484,14 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -1969,10 +2499,14 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -1980,11 +2514,15 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX11-WGP-LABEL: flat_workgroup_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1993,11 +2531,15 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX11-CU-LABEL: flat_workgroup_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2006,11 +2548,15 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX12-WGP-LABEL: flat_workgroup_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -2021,11 +2567,15 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX12-CU-LABEL: flat_workgroup_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -2037,7 +2587,11 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX1250-LABEL: flat_workgroup_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2059,11 +2613,15 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -2076,11 +2634,15 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2096,11 +2658,15 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2111,11 +2677,15 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 @@ -2126,10 +2696,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -2140,10 +2714,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -2153,10 +2731,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -2165,10 +2747,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -2178,11 +2764,15 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2194,11 +2784,15 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2209,11 +2803,15 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -2226,11 +2824,15 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -2243,7 +2845,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX1250-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2266,11 +2872,15 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -2283,11 +2893,15 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2303,11 +2917,15 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2318,11 +2936,15 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 @@ -2333,10 +2955,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -2347,10 +2973,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -2360,10 +2990,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -2372,10 +3006,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -2385,11 +3023,15 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2401,11 +3043,15 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2416,11 +3062,15 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -2433,11 +3083,15 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -2450,7 +3104,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX1250-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2474,6 +3132,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2494,6 +3156,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2514,6 +3180,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2530,6 +3200,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2548,6 +3222,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2564,6 +3242,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2578,6 +3260,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2592,6 +3278,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2606,6 +3296,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2622,6 +3316,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2638,6 +3336,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2654,6 +3356,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2670,7 +3376,11 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX1250-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2694,6 +3404,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2715,6 +3429,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2737,6 +3455,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2754,6 +3476,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2773,6 +3499,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2790,6 +3520,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -2805,6 +3539,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2820,6 +3558,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -2835,6 +3577,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2853,6 +3599,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2870,6 +3620,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2890,6 +3644,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2909,7 +3667,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX1250-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2935,6 +3697,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2956,6 +3722,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2978,6 +3748,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2995,6 +3769,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -3014,6 +3792,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -3031,6 +3813,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -3046,6 +3832,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -3061,6 +3851,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -3076,6 +3870,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3094,6 +3892,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3111,6 +3913,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3131,6 +3937,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3150,7 +3960,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX1250-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -3176,11 +3990,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3190,6 +4009,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3205,11 +4025,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -3219,6 +4044,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -3234,11 +4060,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -3248,6 +4079,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -3259,11 +4091,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -3273,6 +4110,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -3286,6 +4124,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3302,6 +4146,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3316,6 +4166,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3330,6 +4186,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3344,6 +4206,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3359,6 +4227,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3374,6 +4248,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3389,6 +4269,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3404,7 +4290,13 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3430,11 +4322,16 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3444,6 +4341,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3460,11 +4358,16 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -3474,6 +4377,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -3492,11 +4396,16 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -3506,6 +4415,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -3518,11 +4428,16 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -3532,6 +4447,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -3546,6 +4462,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3563,6 +4485,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3579,6 +4507,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3594,6 +4528,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3610,6 +4550,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3628,6 +4574,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3644,6 +4596,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3661,6 +4619,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3677,7 +4641,13 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3704,11 +4674,16 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3718,6 +4693,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3734,11 +4710,16 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -3748,6 +4729,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -3765,11 +4747,16 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -3779,6 +4766,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -3792,11 +4780,16 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -3806,6 +4799,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -3820,6 +4814,12 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3837,6 +4837,12 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3852,6 +4858,12 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3867,6 +4879,12 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3882,6 +4900,12 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3899,6 +4923,12 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3916,6 +4946,12 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3935,6 +4971,12 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3954,7 +4996,13 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX1250-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3982,11 +5030,16 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3996,6 +5049,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4013,11 +5067,16 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -4027,6 +5086,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -4047,11 +5107,16 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -4061,6 +5126,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -4076,11 +5142,16 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -4090,6 +5161,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -4105,6 +5177,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4123,6 +5201,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4140,6 +5224,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4156,6 +5246,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4173,6 +5269,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4193,6 +5295,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4212,6 +5320,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4233,6 +5347,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4253,7 +5373,13 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4282,11 +5408,16 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4296,6 +5427,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4313,11 +5445,16 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -4327,6 +5464,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -4347,11 +5485,16 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -4361,6 +5504,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -4376,11 +5520,16 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -4390,6 +5539,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -4405,6 +5555,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4423,6 +5579,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4440,6 +5602,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4456,6 +5624,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4473,6 +5647,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4493,6 +5673,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4512,6 +5698,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4533,6 +5725,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4553,7 +5751,13 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4582,11 +5786,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4596,6 +5805,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4612,11 +5822,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -4626,6 +5841,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -4644,11 +5860,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -4658,6 +5879,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -4670,11 +5892,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -4684,6 +5911,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -4698,6 +5926,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4715,6 +5949,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4731,6 +5971,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4746,6 +5992,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4762,6 +6014,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4780,6 +6038,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4796,6 +6060,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4813,6 +6083,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4829,7 +6105,13 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4856,11 +6138,16 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4870,6 +6157,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4886,11 +6174,16 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -4900,6 +6193,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -4918,11 +6212,16 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -4932,6 +6231,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -4944,11 +6244,16 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -4958,6 +6263,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -4972,6 +6278,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4989,6 +6301,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5005,6 +6323,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5020,6 +6344,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5036,6 +6366,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5054,6 +6390,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5070,6 +6412,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5087,6 +6435,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5103,7 +6457,13 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX1250-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5130,11 +6490,16 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5144,6 +6509,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5161,11 +6527,16 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -5175,6 +6546,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -5195,11 +6567,16 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -5209,6 +6586,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -5224,11 +6602,16 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -5238,6 +6621,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -5253,6 +6637,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5271,6 +6661,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5288,6 +6684,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5304,6 +6706,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5321,6 +6729,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5341,6 +6755,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5360,6 +6780,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5381,6 +6807,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5401,7 +6833,13 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX1250-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5430,11 +6868,16 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5444,6 +6887,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5461,11 +6905,16 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -5475,6 +6924,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -5495,11 +6945,16 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -5509,6 +6964,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -5524,11 +6980,16 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -5538,6 +6999,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -5553,6 +7015,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5571,6 +7039,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5588,6 +7062,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5604,6 +7084,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5621,6 +7107,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5641,6 +7133,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5660,6 +7158,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5681,6 +7185,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5701,7 +7211,13 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5730,11 +7246,16 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5744,6 +7265,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5761,11 +7283,16 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -5775,6 +7302,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -5795,11 +7323,16 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -5809,6 +7342,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -5824,11 +7358,16 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -5838,6 +7377,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -5853,6 +7393,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5871,6 +7417,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5888,6 +7440,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5904,6 +7462,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5921,6 +7485,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5941,6 +7511,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5960,6 +7536,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5981,6 +7563,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6001,7 +7589,13 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6030,11 +7624,16 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6044,6 +7643,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6061,11 +7661,16 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -6075,6 +7680,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -6095,11 +7701,16 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -6109,6 +7720,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -6124,11 +7736,16 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -6138,6 +7755,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -6153,6 +7771,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6171,6 +7795,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6188,6 +7818,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6204,6 +7840,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6221,6 +7863,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6241,6 +7889,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6260,6 +7914,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6281,6 +7941,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6301,7 +7967,13 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6331,6 +8003,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -6364,6 +8042,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -6397,6 +8081,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -6426,6 +8116,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -6456,6 +8152,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6475,6 +8177,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6492,6 +8200,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6509,6 +8223,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6526,6 +8246,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6545,6 +8271,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6564,6 +8296,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6583,6 +8321,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6602,7 +8346,13 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6633,6 +8383,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -6667,6 +8423,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -6701,6 +8463,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -6731,6 +8499,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -6762,6 +8536,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6782,6 +8562,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6800,6 +8586,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6818,6 +8610,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6836,6 +8634,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6856,6 +8660,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6876,6 +8686,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6896,6 +8712,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6916,7 +8738,13 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6947,6 +8775,12 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -6981,6 +8815,12 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -7016,6 +8856,12 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -7047,6 +8893,12 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -7078,6 +8930,12 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7098,6 +8956,12 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7116,6 +8980,12 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7134,6 +9004,12 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7152,6 +9028,12 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7173,6 +9055,12 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7194,6 +9082,12 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7217,6 +9111,12 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7240,7 +9140,13 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7273,6 +9179,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7308,6 +9220,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -7344,6 +9262,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -7375,6 +9299,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -7407,6 +9337,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7428,6 +9364,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7447,6 +9389,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7466,6 +9414,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7485,6 +9439,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7507,6 +9467,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7528,6 +9494,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7552,6 +9524,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7575,7 +9553,13 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7608,6 +9592,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7643,6 +9633,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -7679,6 +9675,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -7710,6 +9712,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -7742,6 +9750,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7763,6 +9777,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7782,6 +9802,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7801,6 +9827,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7820,6 +9852,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7842,6 +9880,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7863,6 +9907,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7887,6 +9937,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7910,7 +9966,13 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7943,6 +10005,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7977,6 +10045,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -8011,6 +10085,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -8041,6 +10121,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -8072,6 +10158,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8092,6 +10184,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8110,6 +10208,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8128,6 +10232,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8146,6 +10256,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8166,6 +10282,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8186,6 +10308,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8206,6 +10334,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8226,7 +10360,13 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8257,6 +10397,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8291,6 +10437,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -8325,6 +10477,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -8355,6 +10513,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -8386,6 +10550,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8406,6 +10576,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8424,6 +10600,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8442,6 +10624,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8460,6 +10648,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8480,6 +10674,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8500,6 +10700,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8520,6 +10726,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8540,7 +10752,13 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8571,6 +10789,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8606,6 +10830,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -8642,6 +10872,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -8673,6 +10909,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -8705,6 +10947,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8726,6 +10974,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8745,6 +10999,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8764,6 +11024,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8783,6 +11049,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8805,6 +11077,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8826,6 +11104,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8850,6 +11134,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8873,7 +11163,13 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8906,6 +11202,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8941,6 +11243,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -8977,6 +11285,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -9008,6 +11322,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -9040,6 +11360,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9061,6 +11387,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9080,6 +11412,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9099,6 +11437,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9118,6 +11462,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9140,6 +11490,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9161,6 +11517,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9185,6 +11547,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9208,7 +11576,13 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9241,6 +11615,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9276,6 +11656,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -9312,6 +11698,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -9343,6 +11735,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -9375,6 +11773,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9396,6 +11800,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9415,6 +11825,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9434,6 +11850,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9453,6 +11875,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9475,6 +11903,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9496,6 +11930,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9520,6 +11960,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9543,7 +11989,13 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9576,6 +12028,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9611,6 +12069,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -9647,6 +12111,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -9679,6 +12149,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -9711,6 +12187,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9732,6 +12214,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9751,6 +12239,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9770,6 +12264,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9789,6 +12289,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9811,6 +12317,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9833,6 +12345,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9857,6 +12375,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9881,7 +12405,13 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9914,6 +12444,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9949,6 +12485,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -9985,6 +12527,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -10017,6 +12565,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -10049,6 +12603,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10070,6 +12630,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10089,6 +12655,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10108,6 +12680,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10127,6 +12705,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10149,6 +12733,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10171,6 +12761,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10195,6 +12791,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10219,7 +12821,13 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10252,6 +12860,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10287,6 +12901,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -10323,6 +12943,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -10354,6 +12980,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -10386,6 +13018,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10407,6 +13045,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10426,6 +13070,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10445,6 +13095,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10464,6 +13120,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10486,6 +13148,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10507,6 +13175,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10531,6 +13205,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10554,7 +13234,13 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10587,6 +13273,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10622,6 +13314,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -10658,6 +13356,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -10689,6 +13393,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -10721,6 +13431,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10742,6 +13458,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10761,6 +13483,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10780,6 +13508,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10799,6 +13533,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10821,6 +13561,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10842,6 +13588,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10866,6 +13618,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10889,7 +13647,13 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10922,6 +13686,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10957,6 +13727,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -10993,6 +13769,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -11024,6 +13806,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -11056,6 +13844,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11077,6 +13871,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11096,6 +13896,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11115,6 +13921,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11134,6 +13946,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11156,6 +13974,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11177,6 +14001,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11201,6 +14031,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11224,7 +14060,13 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -11255,15 +14097,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11273,15 +14119,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -11291,29 +14141,37 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11321,13 +14179,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11335,11 +14197,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11347,23 +14213,31 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11371,40 +14245,52 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11413,12 +14299,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11428,12 +14318,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; GFX1250-LABEL: flat_workgroup_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -11449,15 +14343,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11467,15 +14365,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -11485,29 +14387,37 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11515,13 +14425,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11529,11 +14443,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11541,23 +14459,31 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11565,40 +14491,52 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11607,12 +14545,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11622,12 +14564,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; GFX1250-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -11643,15 +14589,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11661,17 +14611,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -11681,29 +14634,37 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11711,13 +14672,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11725,82 +14690,105 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -11809,12 +14797,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11824,12 +14816,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX1250-LABEL: flat_workgroup_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -11845,15 +14841,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11863,9 +14863,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -11873,9 +14876,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -11885,32 +14888,38 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11918,13 +14927,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11932,49 +14945,64 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -11982,34 +15010,39 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -12019,6 +15052,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -12027,9 +15061,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -12038,6 +15075,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -12047,14 +15085,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { @@ -12071,6 +15113,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -12086,6 +15132,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -12101,6 +15151,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -12112,6 +15166,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -12125,6 +15183,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -12137,6 +15199,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -12147,6 +15213,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -12157,6 +15227,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -12167,6 +15241,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX11-WGP-LABEL: flat_workgroup_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -12178,6 +15256,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX11-CU-LABEL: flat_workgroup_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -12189,6 +15271,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX12-WGP-LABEL: flat_workgroup_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -12200,6 +15286,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX12-CU-LABEL: flat_workgroup_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -12211,12 +15301,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX1250-LABEL: flat_workgroup_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -12232,6 +15326,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -12247,6 +15345,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -12262,6 +15364,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -12273,6 +15379,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -12286,6 +15396,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -12298,6 +15412,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -12308,6 +15426,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -12318,6 +15440,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -12328,6 +15454,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -12339,6 +15469,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -12350,6 +15484,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -12361,6 +15499,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -12372,12 +15514,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX1250-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -12393,6 +15539,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -12408,6 +15558,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -12425,6 +15579,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -12438,6 +15596,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -12451,6 +15613,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -12463,6 +15629,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -12474,6 +15644,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -12484,6 +15658,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -12495,6 +15673,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -12508,6 +15690,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX11-CU-LABEL: flat_workgroup_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -12521,6 +15707,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -12536,6 +15726,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX12-CU-LABEL: flat_workgroup_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -12551,14 +15745,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX1250-LABEL: flat_workgroup_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -12574,6 +15772,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -12589,6 +15791,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 @@ -12606,6 +15812,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 @@ -12619,6 +15829,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -12632,6 +15846,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -12644,6 +15862,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] @@ -12655,6 +15877,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -12665,6 +15891,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -12676,6 +15906,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -12689,6 +15923,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -12702,6 +15940,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 @@ -12717,6 +15959,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 @@ -12732,14 +15978,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -12754,11 +16004,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -12769,11 +16023,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm @@ -12784,22 +16042,30 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -12808,10 +16074,14 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -12820,74 +16090,102 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm @@ -12895,7 +16193,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX1250-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12915,11 +16217,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -12930,11 +16236,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -12947,22 +16257,30 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -12971,10 +16289,14 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -12983,10 +16305,14 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12995,20 +16321,28 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -13017,11 +16351,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -13030,22 +16368,30 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -13054,11 +16400,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm @@ -13066,7 +16416,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX1250-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13087,11 +16441,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -13102,11 +16460,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -13119,11 +16481,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -13132,11 +16498,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -13145,10 +16515,14 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -13157,10 +16531,14 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -13168,20 +16546,28 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -13189,11 +16575,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -13202,11 +16592,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -13215,11 +16609,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -13230,11 +16628,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -13246,7 +16648,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX1250-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13268,11 +16674,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -13283,11 +16693,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -13302,11 +16716,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -13316,11 +16734,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -13329,10 +16751,14 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -13341,10 +16767,14 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -13354,20 +16784,28 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -13377,11 +16815,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -13392,11 +16834,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -13406,11 +16852,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -13423,11 +16873,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -13440,7 +16894,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13463,11 +16921,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -13478,11 +16940,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -13497,11 +16963,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -13511,11 +16981,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -13524,10 +16998,14 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -13536,10 +17014,14 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -13549,20 +17031,28 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 @@ -13572,11 +17062,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -13587,11 +17081,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -13601,11 +17099,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -13618,11 +17120,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -13635,7 +17141,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13659,6 +17169,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13678,6 +17192,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -13699,6 +17217,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -13714,6 +17236,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -13731,6 +17257,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -13746,6 +17276,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -13760,6 +17294,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -13773,6 +17311,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -13787,6 +17329,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -13804,6 +17350,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -13819,6 +17369,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -13836,6 +17390,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -13851,7 +17409,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX1250-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13875,6 +17437,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13894,6 +17460,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -13917,6 +17487,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -13935,6 +17509,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -13952,6 +17530,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -13967,6 +17549,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -13982,6 +17568,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -13995,6 +17585,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -14010,6 +17604,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -14029,6 +17627,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -14047,6 +17649,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -14068,6 +17674,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -14088,7 +17698,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14114,6 +17728,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14133,6 +17751,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -14156,6 +17778,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -14174,6 +17800,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14191,6 +17821,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -14206,6 +17840,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] @@ -14221,6 +17859,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -14234,6 +17876,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -14249,6 +17895,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -14268,6 +17918,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -14286,6 +17940,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -14307,6 +17965,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -14327,7 +17989,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14353,11 +18019,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14367,6 +18038,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -14382,11 +18054,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -14396,6 +18073,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -14411,11 +18089,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -14425,6 +18108,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -14436,11 +18120,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -14450,6 +18139,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -14463,6 +18153,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14479,6 +18175,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14493,6 +18195,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14507,6 +18215,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14521,6 +18235,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14536,6 +18256,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14551,6 +18277,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14566,6 +18298,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14581,7 +18319,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -14607,11 +18351,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14621,6 +18370,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -14636,11 +18386,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -14650,6 +18405,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -14667,11 +18423,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -14681,6 +18442,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -14692,11 +18454,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -14706,6 +18473,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -14719,6 +18487,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14735,6 +18509,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14751,6 +18531,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14765,6 +18551,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14781,6 +18573,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14798,6 +18596,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14813,6 +18617,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14830,6 +18640,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14845,7 +18661,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -14872,11 +18694,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14886,6 +18713,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -14901,11 +18729,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -14915,6 +18748,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -14932,11 +18766,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -14946,6 +18785,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -14959,11 +18799,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -14973,6 +18818,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -14986,6 +18832,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15002,6 +18854,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15017,6 +18875,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15031,6 +18895,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15046,6 +18916,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15063,6 +18939,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15080,6 +18962,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15099,6 +18987,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15118,7 +19012,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15146,11 +19046,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15160,6 +19065,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15175,11 +19081,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -15189,6 +19100,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -15208,11 +19120,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -15222,6 +19139,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -15236,11 +19154,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -15250,6 +19173,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -15263,6 +19187,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15279,6 +19209,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15296,6 +19232,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15310,6 +19252,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15327,6 +19275,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15346,6 +19300,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15364,6 +19324,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15385,6 +19351,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15405,7 +19377,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15434,11 +19412,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15448,6 +19431,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15463,11 +19447,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -15477,6 +19466,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -15496,11 +19486,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -15510,6 +19505,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -15524,11 +19520,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -15538,6 +19539,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -15551,6 +19553,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15567,6 +19575,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15584,6 +19598,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15598,6 +19618,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15615,6 +19641,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15634,6 +19666,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15652,6 +19690,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15673,6 +19717,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15693,7 +19743,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15722,11 +19778,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15736,6 +19797,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15751,11 +19813,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -15765,6 +19832,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -15782,11 +19850,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -15796,6 +19869,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -15807,11 +19881,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -15821,6 +19900,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -15834,6 +19914,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15850,6 +19936,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15866,6 +19958,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15880,6 +19978,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15896,6 +20000,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15913,6 +20023,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15928,6 +20044,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15945,6 +20067,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15960,7 +20088,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15987,11 +20121,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16001,6 +20140,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16016,11 +20156,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -16030,6 +20175,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -16047,11 +20193,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -16061,6 +20212,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -16072,11 +20224,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -16086,6 +20243,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -16099,6 +20257,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16115,6 +20279,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16131,6 +20301,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16145,6 +20321,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16161,6 +20343,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16178,6 +20366,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16193,6 +20387,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16210,6 +20410,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16225,7 +20431,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16252,11 +20464,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16266,6 +20483,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16281,11 +20499,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -16295,6 +20518,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -16314,11 +20538,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -16328,6 +20557,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -16342,11 +20572,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -16356,6 +20591,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -16369,6 +20605,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16385,6 +20627,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16402,6 +20650,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16416,6 +20670,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16433,6 +20693,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16452,6 +20718,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16470,6 +20742,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16491,6 +20769,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16511,7 +20795,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16540,11 +20830,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16554,6 +20849,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16569,11 +20865,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -16583,6 +20884,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -16602,11 +20904,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -16616,6 +20923,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -16630,11 +20938,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -16644,6 +20957,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -16657,6 +20971,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16673,6 +20993,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16690,6 +21016,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16704,6 +21036,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16721,6 +21059,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16740,6 +21084,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16758,6 +21108,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16779,6 +21135,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16799,7 +21161,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16828,11 +21196,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16842,6 +21215,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16857,11 +21231,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -16871,6 +21250,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -16890,11 +21270,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -16904,6 +21289,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -16918,11 +21304,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -16932,6 +21323,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -16945,6 +21337,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16961,6 +21359,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16978,6 +21382,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16992,6 +21402,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17009,6 +21425,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17028,6 +21450,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17046,6 +21474,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17067,6 +21501,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17087,7 +21527,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17116,11 +21562,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17130,6 +21581,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17145,11 +21597,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -17159,6 +21616,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -17178,11 +21636,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -17192,6 +21655,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -17205,11 +21669,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -17219,6 +21688,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -17232,6 +21702,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17248,6 +21724,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17265,6 +21747,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17279,6 +21767,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17296,6 +21790,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17315,6 +21815,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17332,6 +21838,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17353,6 +21865,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17372,7 +21890,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17401,11 +21925,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17415,6 +21944,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17430,11 +21960,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -17444,6 +21979,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -17463,11 +21999,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -17477,6 +22018,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -17490,11 +22032,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -17504,6 +22051,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -17517,6 +22065,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17533,6 +22087,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17550,6 +22110,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17564,6 +22130,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17581,6 +22153,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17600,6 +22178,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17617,6 +22201,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17638,6 +22228,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17657,7 +22253,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17686,11 +22288,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17700,6 +22307,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17715,11 +22323,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -17729,6 +22342,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -17748,11 +22362,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -17762,6 +22381,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -17776,11 +22396,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -17790,6 +22415,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -17803,6 +22429,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17819,6 +22451,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17836,6 +22474,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17850,6 +22494,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17867,6 +22517,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17886,6 +22542,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17904,6 +22566,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17925,6 +22593,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17945,7 +22619,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17974,11 +22654,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17988,6 +22673,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -18003,11 +22689,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -18017,6 +22708,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -18036,11 +22728,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -18050,6 +22747,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -18064,11 +22762,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -18078,6 +22781,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -18091,6 +22795,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18107,6 +22817,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18124,6 +22840,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18138,6 +22860,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18155,6 +22883,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18174,6 +22908,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18192,6 +22932,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18213,6 +22959,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18233,7 +22985,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18262,11 +23020,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -18276,6 +23039,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -18291,11 +23055,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_mov_b32 s4, s8 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10 @@ -18305,6 +23074,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 @@ -18324,11 +23094,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_mov_b32 s4, s8 ; GFX10-CU-NEXT: s_mov_b32 s5, s9 ; GFX10-CU-NEXT: s_mov_b32 s9, s10 @@ -18338,6 +23113,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX10-CU-NEXT: s_mov_b32 s5, s8 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 @@ -18352,11 +23128,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 @@ -18366,6 +23147,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 @@ -18379,6 +23161,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18395,6 +23183,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18412,6 +23206,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18426,6 +23226,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18443,6 +23249,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18462,6 +23274,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18480,6 +23298,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18501,6 +23325,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18521,7 +23351,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18551,6 +23387,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -18584,6 +23426,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -18617,6 +23465,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -18646,6 +23500,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -18676,6 +23536,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18695,6 +23561,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18712,6 +23584,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18729,6 +23607,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18746,6 +23630,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18765,6 +23655,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18784,6 +23680,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18803,6 +23705,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18822,7 +23730,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18853,6 +23767,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -18886,6 +23806,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -18921,6 +23847,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -18950,6 +23882,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -18980,6 +23918,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18999,6 +23943,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19017,6 +23967,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19034,6 +23990,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19052,6 +24014,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19073,6 +24041,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19092,6 +24066,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19113,6 +24093,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19132,7 +24118,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19163,6 +24155,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19196,6 +24194,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -19231,6 +24235,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -19262,6 +24272,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -19292,6 +24308,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19311,6 +24333,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19329,6 +24357,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19346,6 +24380,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19364,6 +24404,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19385,6 +24431,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19406,6 +24458,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19429,6 +24487,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19452,7 +24516,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19485,6 +24555,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19518,6 +24594,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -19555,6 +24637,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -19587,6 +24675,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -19617,6 +24711,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19636,6 +24736,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19655,6 +24761,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19672,6 +24784,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19691,6 +24809,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19714,6 +24838,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19736,6 +24866,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19761,6 +24897,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19785,7 +24927,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19818,6 +24966,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19851,6 +25005,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -19888,6 +25048,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -19920,6 +25086,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -19950,6 +25122,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19969,6 +25147,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19988,6 +25172,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20005,6 +25195,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20024,6 +25220,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20047,6 +25249,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20069,6 +25277,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20094,6 +25308,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20118,7 +25338,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20151,6 +25377,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20184,6 +25416,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -20219,6 +25457,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -20248,6 +25492,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -20278,6 +25528,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20297,6 +25553,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20315,6 +25577,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20332,6 +25600,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20350,6 +25624,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20371,6 +25651,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20390,6 +25676,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20411,6 +25703,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20430,7 +25728,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20461,6 +25765,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20494,6 +25804,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -20529,6 +25845,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -20558,6 +25880,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -20588,6 +25916,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20607,6 +25941,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20625,6 +25965,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20642,6 +25988,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20660,6 +26012,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20681,6 +26039,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20700,6 +26064,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20721,6 +26091,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20740,7 +26116,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20771,6 +26153,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20804,6 +26192,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -20841,6 +26235,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -20873,6 +26273,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -20903,6 +26309,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20922,6 +26334,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20941,6 +26359,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20958,6 +26382,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20977,6 +26407,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21000,6 +26436,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21022,6 +26464,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21047,6 +26495,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21071,7 +26525,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21104,6 +26564,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21137,6 +26603,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -21174,6 +26646,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -21206,6 +26684,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -21236,6 +26720,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21255,6 +26745,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21274,6 +26770,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21291,6 +26793,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21310,6 +26818,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21333,6 +26847,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21355,6 +26875,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21380,6 +26906,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21404,7 +26936,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21437,6 +26975,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21470,6 +27014,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -21507,6 +27057,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -21539,6 +27095,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -21569,6 +27131,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21588,6 +27156,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21607,6 +27181,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21624,6 +27204,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21643,6 +27229,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21666,6 +27258,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21688,6 +27286,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21713,6 +27317,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21737,7 +27347,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21770,6 +27386,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21803,6 +27425,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -21840,6 +27468,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -21871,6 +27505,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -21901,6 +27541,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21920,6 +27566,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21939,6 +27591,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21956,6 +27614,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21975,6 +27639,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21998,6 +27668,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22019,6 +27695,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22044,6 +27726,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22067,7 +27755,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -22100,6 +27794,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -22133,6 +27833,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -22170,6 +27876,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -22201,6 +27913,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -22231,6 +27949,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22250,6 +27974,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22269,6 +27999,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22286,6 +28022,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22305,6 +28047,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22328,6 +28076,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22349,6 +28103,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22374,6 +28134,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22397,7 +28163,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -22430,6 +28202,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -22463,6 +28241,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -22500,6 +28284,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -22532,6 +28322,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -22562,6 +28358,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22581,6 +28383,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22600,6 +28408,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22617,6 +28431,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22636,6 +28456,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22659,6 +28485,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22681,6 +28513,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22706,6 +28544,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22730,7 +28574,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -22763,6 +28613,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -22796,6 +28652,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -22833,6 +28695,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -22865,6 +28733,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -22895,6 +28769,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22914,6 +28794,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22933,6 +28819,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22950,6 +28842,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22969,6 +28867,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22992,6 +28896,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -23014,6 +28924,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -23039,6 +28955,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -23063,7 +28985,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -23096,6 +29024,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -23129,6 +29063,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 @@ -23166,6 +29106,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 @@ -23198,6 +29144,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 @@ -23228,6 +29180,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23247,6 +29205,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23266,6 +29230,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23283,6 +29253,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23302,6 +29278,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -23325,6 +29307,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -23347,6 +29335,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -23372,6 +29366,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -23396,7 +29396,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index bd92242ddd22..736af2a4713c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -18,6 +18,9 @@ define amdgpu_kernel void @global_agent_unordered_load( ; GFX6-LABEL: global_agent_unordered_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -45,12 +48,16 @@ define amdgpu_kernel void @global_agent_unordered_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -59,29 +66,38 @@ define amdgpu_kernel void @global_agent_unordered_load( ; ; GFX10-WGP-LABEL: global_agent_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -106,101 +122,131 @@ define amdgpu_kernel void @global_agent_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_agent_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_agent_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_agent_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -214,6 +260,9 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; GFX6-LABEL: global_agent_monotonic_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -241,12 +290,16 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -255,29 +308,38 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; ; GFX10-WGP-LABEL: global_agent_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -302,101 +364,131 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_agent_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_agent_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_agent_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -410,6 +502,9 @@ define amdgpu_kernel void @global_agent_acquire_load( ; GFX6-LABEL: global_agent_acquire_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -438,14 +533,18 @@ define amdgpu_kernel void @global_agent_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_store_dword v[0:1], v2 @@ -453,33 +552,44 @@ define amdgpu_kernel void @global_agent_acquire_load( ; ; GFX10-WGP-LABEL: global_agent_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -504,113 +614,149 @@ define amdgpu_kernel void @global_agent_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_agent_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_agent_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_agent_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -624,6 +770,9 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; GFX6-LABEL: global_agent_seq_cst_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -653,9 +802,12 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -669,8 +821,12 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX10-WGP-LABEL: global_agent_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -683,8 +839,12 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX10-CU-LABEL: global_agent_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -698,6 +858,9 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -723,8 +886,12 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc @@ -735,8 +902,12 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc @@ -747,8 +918,12 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 @@ -759,8 +934,12 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX942-TGSPLIT-LABEL: global_agent_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 @@ -771,8 +950,12 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX11-WGP-LABEL: global_agent_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -785,8 +968,12 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX11-CU-LABEL: global_agent_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -799,49 +986,61 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX12-WGP-LABEL: global_agent_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_agent_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -855,6 +1054,9 @@ define amdgpu_kernel void @global_agent_unordered_store( ; GFX6-LABEL: global_agent_unordered_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -876,6 +1078,10 @@ define amdgpu_kernel void @global_agent_unordered_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -886,27 +1092,38 @@ define amdgpu_kernel void @global_agent_unordered_store( ; ; GFX10-WGP-LABEL: global_agent_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -924,93 +1141,129 @@ define amdgpu_kernel void @global_agent_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_agent_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_agent_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_agent_unordered_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1023,6 +1276,9 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; GFX6-LABEL: global_agent_monotonic_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1044,6 +1300,10 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1054,27 +1314,38 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; ; GFX10-WGP-LABEL: global_agent_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1092,93 +1363,129 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_agent_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_agent_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_agent_monotonic_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1191,6 +1498,9 @@ define amdgpu_kernel void @global_agent_release_store( ; GFX6-LABEL: global_agent_release_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1213,6 +1523,10 @@ define amdgpu_kernel void @global_agent_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1224,10 +1538,13 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX10-WGP-LABEL: global_agent_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1236,10 +1553,13 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX10-CU-LABEL: global_agent_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1249,6 +1569,9 @@ define amdgpu_kernel void @global_agent_release_store( ; SKIP-CACHE-INV-LABEL: global_agent_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1267,10 +1590,13 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -1278,10 +1604,13 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -1289,10 +1618,13 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1301,10 +1633,13 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX942-TGSPLIT-LABEL: global_agent_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1313,10 +1648,13 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX11-WGP-LABEL: global_agent_release_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1325,10 +1663,13 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX11-CU-LABEL: global_agent_release_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1337,39 +1678,50 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX12-WGP-LABEL: global_agent_release_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_release_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_agent_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1377,6 +1729,7 @@ define amdgpu_kernel void @global_agent_release_store( ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1389,6 +1742,9 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; GFX6-LABEL: global_agent_seq_cst_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1411,6 +1767,10 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1422,10 +1782,13 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX10-WGP-LABEL: global_agent_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1434,10 +1797,13 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX10-CU-LABEL: global_agent_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1447,6 +1813,9 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1465,10 +1834,13 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -1476,10 +1848,13 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -1487,10 +1862,13 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1499,10 +1877,13 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX942-TGSPLIT-LABEL: global_agent_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1511,10 +1892,13 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX11-WGP-LABEL: global_agent_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1523,10 +1907,13 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX11-CU-LABEL: global_agent_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1535,39 +1922,50 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX12-WGP-LABEL: global_agent_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_agent_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1575,6 +1973,7 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1587,6 +1986,10 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX6-LABEL: global_agent_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -1606,18 +2009,26 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_agent_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1627,7 +2038,11 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: global_agent_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1638,6 +2053,10 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -1654,7 +2073,11 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1664,7 +2087,11 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1674,7 +2101,11 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1684,7 +2115,11 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_agent_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1694,7 +2129,11 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: global_agent_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1704,7 +2143,11 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: global_agent_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1714,7 +2157,11 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: global_agent_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1724,7 +2171,11 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: global_agent_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1735,7 +2186,11 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX1250-LABEL: global_agent_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1753,6 +2208,10 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX6-LABEL: global_agent_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -1774,11 +2233,15 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1787,7 +2250,11 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: global_agent_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1800,7 +2267,11 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; ; GFX10-CU-LABEL: global_agent_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1814,6 +2285,10 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_agent_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -1831,7 +2306,11 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1843,7 +2322,11 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1855,7 +2338,11 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1867,7 +2354,11 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_agent_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1879,7 +2370,11 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: global_agent_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1892,7 +2387,11 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; ; GFX11-CU-LABEL: global_agent_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1905,7 +2404,11 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: global_agent_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1917,7 +2420,11 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; ; GFX12-CU-LABEL: global_agent_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1930,7 +2437,11 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX1250-LABEL: global_agent_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1951,6 +2462,10 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX6-LABEL: global_agent_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -1971,11 +2486,15 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -1983,7 +2502,11 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; ; GFX10-WGP-LABEL: global_agent_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1995,7 +2518,11 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; ; GFX10-CU-LABEL: global_agent_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2008,6 +2535,10 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_agent_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2025,7 +2556,11 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2036,7 +2571,11 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2047,7 +2586,11 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2059,7 +2602,11 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_agent_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2071,7 +2618,11 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; ; GFX11-WGP-LABEL: global_agent_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2083,7 +2634,11 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; ; GFX11-CU-LABEL: global_agent_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2095,7 +2650,11 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; ; GFX12-WGP-LABEL: global_agent_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2109,7 +2668,11 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; ; GFX12-CU-LABEL: global_agent_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2124,7 +2687,11 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX1250-LABEL: global_agent_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2147,6 +2714,10 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX6-LABEL: global_agent_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2169,11 +2740,15 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -2183,7 +2758,11 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: global_agent_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2198,7 +2777,11 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: global_agent_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2214,6 +2797,10 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2232,7 +2819,11 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2245,7 +2836,11 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2258,7 +2853,11 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2272,7 +2871,11 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_agent_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2286,7 +2889,11 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: global_agent_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2301,7 +2908,11 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: global_agent_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2316,7 +2927,11 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: global_agent_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2332,7 +2947,11 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: global_agent_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2349,7 +2968,11 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX1250-LABEL: global_agent_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2375,6 +2998,10 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX6-LABEL: global_agent_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2397,11 +3024,15 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -2411,7 +3042,11 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: global_agent_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2426,7 +3061,11 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: global_agent_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2442,6 +3081,10 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2460,7 +3103,11 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2473,7 +3120,11 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2486,7 +3137,11 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2500,7 +3155,11 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_agent_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2514,7 +3173,11 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: global_agent_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2529,7 +3192,11 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: global_agent_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2544,7 +3211,11 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: global_agent_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2560,7 +3231,11 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: global_agent_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2577,7 +3252,11 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX1250-LABEL: global_agent_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2603,6 +3282,10 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX6-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2626,6 +3309,10 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2641,7 +3328,11 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2655,7 +3346,11 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2670,6 +3365,10 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_agent_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2688,7 +3387,11 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2701,7 +3404,11 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2714,7 +3421,11 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2727,7 +3438,11 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2740,7 +3455,11 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2754,7 +3473,11 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2768,7 +3491,11 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2781,7 +3508,11 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2795,7 +3526,11 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX1250-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2818,6 +3553,10 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2842,6 +3581,10 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2858,7 +3601,11 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2874,7 +3621,11 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2891,6 +3642,10 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2910,7 +3665,11 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2924,7 +3683,11 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2938,7 +3701,11 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2953,7 +3720,11 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2968,7 +3739,11 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2984,7 +3759,11 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3000,7 +3779,11 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3017,7 +3800,11 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3035,7 +3822,11 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX1250-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -3063,6 +3854,10 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -3087,6 +3882,10 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -3103,7 +3902,11 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3119,7 +3922,11 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3136,6 +3943,10 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -3155,7 +3966,11 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3169,7 +3984,11 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3183,7 +4002,11 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3198,7 +4021,11 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3213,7 +4040,11 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3229,7 +4060,11 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3245,7 +4080,11 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3262,7 +4101,11 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3280,7 +4123,11 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX1250-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -3309,6 +4156,12 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3333,11 +4186,16 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3347,6 +4205,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3357,7 +4216,13 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3371,7 +4236,13 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3387,6 +4258,12 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3407,7 +4284,13 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3421,7 +4304,13 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3435,7 +4324,13 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3449,7 +4344,13 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3463,7 +4364,13 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3477,7 +4384,13 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3491,7 +4404,13 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3505,7 +4424,13 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3520,7 +4445,13 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3544,6 +4475,12 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3570,11 +4507,16 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3584,6 +4526,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3596,7 +4539,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3613,7 +4562,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3632,6 +4587,12 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3653,7 +4614,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3669,7 +4636,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3685,7 +4658,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3701,7 +4680,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3717,7 +4702,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3734,7 +4725,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3751,7 +4748,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3767,7 +4770,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3784,7 +4793,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3811,6 +4826,12 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3836,11 +4857,16 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3850,6 +4876,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3861,7 +4888,13 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3877,7 +4910,13 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3895,6 +4934,12 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3916,7 +4961,13 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3931,7 +4982,13 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3946,7 +5003,13 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3962,7 +5025,13 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3978,7 +5047,13 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3994,7 +5069,13 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4010,7 +5091,13 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4028,7 +5115,13 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4047,7 +5140,13 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; GFX1250-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4076,6 +5175,12 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4103,11 +5208,16 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4117,6 +5227,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4130,7 +5241,13 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4149,7 +5266,13 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4170,6 +5293,12 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4192,7 +5321,13 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4209,7 +5344,13 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4226,7 +5367,13 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4244,7 +5391,13 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4262,7 +5415,13 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4281,7 +5440,13 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4300,7 +5465,13 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4320,7 +5491,13 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4341,7 +5518,13 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4373,6 +5556,12 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4400,11 +5589,16 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4414,6 +5608,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4427,7 +5622,13 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4446,7 +5647,13 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4467,6 +5674,12 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4489,7 +5702,13 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4506,7 +5725,13 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4523,7 +5748,13 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4541,7 +5772,13 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4559,7 +5796,13 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4578,7 +5821,13 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4597,7 +5846,13 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4617,7 +5872,13 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4638,7 +5899,13 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4670,6 +5937,12 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4696,11 +5969,16 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4710,6 +5988,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4722,7 +6001,13 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4739,7 +6024,13 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4758,6 +6049,12 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4779,7 +6076,13 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4795,7 +6098,13 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4811,7 +6120,13 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4827,7 +6142,13 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4843,7 +6164,13 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4860,7 +6187,13 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4877,7 +6210,13 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4893,7 +6232,13 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4910,7 +6255,13 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4937,6 +6288,12 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4963,11 +6320,16 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4977,6 +6339,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4989,7 +6352,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5006,7 +6375,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5025,6 +6400,12 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5046,7 +6427,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5062,7 +6449,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5078,7 +6471,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5094,7 +6493,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5110,7 +6515,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5127,7 +6538,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5144,7 +6561,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5160,7 +6583,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5177,7 +6606,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX1250-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5204,6 +6639,12 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5231,11 +6672,16 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5245,6 +6691,7 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5258,7 +6705,13 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5277,7 +6730,13 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5298,6 +6757,12 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5320,7 +6785,13 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5337,7 +6808,13 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5354,7 +6831,13 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5372,7 +6855,13 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5390,7 +6879,13 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5409,7 +6904,13 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5428,7 +6929,13 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5448,7 +6955,13 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5469,7 +6982,13 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX1250-LABEL: global_agent_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5501,6 +7020,12 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5528,11 +7053,16 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5542,6 +7072,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5555,7 +7086,13 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5574,7 +7111,13 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5595,6 +7138,12 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5617,7 +7166,13 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5634,7 +7189,13 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5651,7 +7212,13 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5669,7 +7236,13 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5687,7 +7260,13 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5706,7 +7285,13 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5725,7 +7310,13 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5745,7 +7336,13 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5766,7 +7363,13 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5798,6 +7401,12 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5825,11 +7434,16 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5839,6 +7453,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5852,7 +7467,13 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5871,7 +7492,13 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5892,6 +7519,12 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5914,7 +7547,13 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5931,7 +7570,13 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5948,7 +7593,13 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5966,7 +7617,13 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5984,7 +7641,13 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6003,7 +7666,13 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6022,7 +7691,13 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6042,7 +7717,13 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6063,7 +7744,13 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6095,6 +7782,12 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6122,11 +7815,16 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6136,6 +7834,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6149,7 +7848,13 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6168,7 +7873,13 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6189,6 +7900,12 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6211,7 +7928,13 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6228,7 +7951,13 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6245,7 +7974,13 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6263,7 +7998,13 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6281,7 +8022,13 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6300,7 +8047,13 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6319,7 +8072,13 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6339,7 +8098,13 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6360,7 +8125,13 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6392,6 +8163,12 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6419,11 +8196,16 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6433,6 +8215,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6446,7 +8229,13 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6465,7 +8254,13 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6486,6 +8281,12 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6508,7 +8309,13 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6525,7 +8332,13 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6542,7 +8355,13 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6560,7 +8379,13 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6578,7 +8403,13 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6597,7 +8428,13 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6616,7 +8453,13 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6636,7 +8479,13 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6657,7 +8506,13 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6689,6 +8544,12 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6716,11 +8577,16 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6730,6 +8596,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6743,7 +8610,13 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6762,7 +8635,13 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6783,6 +8662,12 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6805,7 +8690,13 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6822,7 +8713,13 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6839,7 +8736,13 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6857,7 +8760,13 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6875,7 +8784,13 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6894,7 +8809,13 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6913,7 +8834,13 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6933,7 +8860,13 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6954,7 +8887,13 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX1250-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6986,6 +8925,12 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7013,11 +8958,16 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -7027,6 +8977,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -7040,7 +8991,13 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7059,7 +9016,13 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7080,6 +9043,12 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7102,7 +9071,13 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7119,7 +9094,13 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7136,7 +9117,13 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7154,7 +9141,13 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7172,7 +9165,13 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7191,7 +9190,13 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7210,7 +9215,13 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7230,7 +9241,13 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7251,7 +9268,13 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7283,6 +9306,12 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7310,11 +9339,16 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -7324,6 +9358,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -7337,7 +9372,13 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7356,7 +9397,13 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7377,6 +9424,12 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7399,7 +9452,13 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7416,7 +9475,13 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7433,7 +9498,13 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7451,7 +9522,13 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7469,7 +9546,13 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7488,7 +9571,13 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7507,7 +9596,13 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7527,7 +9622,13 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7548,7 +9649,13 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7580,6 +9687,12 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7608,6 +9721,12 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7635,7 +9754,13 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7651,7 +9776,13 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7669,6 +9800,12 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7692,7 +9829,13 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7708,7 +9851,13 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7724,7 +9873,13 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7740,7 +9895,13 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7756,7 +9917,13 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7772,7 +9939,13 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7788,7 +9961,13 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7804,7 +9983,13 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7821,7 +10006,13 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7849,6 +10040,12 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7878,6 +10075,12 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7906,7 +10109,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7924,7 +10133,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7944,6 +10159,12 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7967,7 +10188,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7984,7 +10211,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8001,7 +10234,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8018,7 +10257,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8035,7 +10280,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8053,7 +10304,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8071,7 +10328,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8088,7 +10351,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8106,7 +10375,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8136,6 +10411,12 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8165,6 +10446,12 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8193,7 +10480,13 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8211,7 +10504,13 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8231,6 +10530,12 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8255,7 +10560,13 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8272,7 +10583,13 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8289,7 +10606,13 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8307,7 +10630,13 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8325,7 +10654,13 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8343,7 +10678,13 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8361,7 +10702,13 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8381,7 +10728,13 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8402,7 +10755,13 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8435,6 +10794,12 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8465,6 +10830,12 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8494,7 +10865,13 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8514,7 +10891,13 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8536,6 +10919,12 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8560,7 +10949,13 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8578,7 +10973,13 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8596,7 +10997,13 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8615,7 +11022,13 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8634,7 +11047,13 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8654,7 +11073,13 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8674,7 +11099,13 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8695,7 +11126,13 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8717,7 +11154,13 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8752,6 +11195,12 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8782,6 +11231,12 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8811,7 +11266,13 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8831,7 +11292,13 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8853,6 +11320,12 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8877,7 +11350,13 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8895,7 +11374,13 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8913,7 +11398,13 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8932,7 +11423,13 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8951,7 +11448,13 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8971,7 +11474,13 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8991,7 +11500,13 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9012,7 +11527,13 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9034,7 +11555,13 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9069,6 +11596,12 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9098,6 +11631,12 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9126,7 +11665,13 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9144,7 +11689,13 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9164,6 +11715,12 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9187,7 +11744,13 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9204,7 +11767,13 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9221,7 +11790,13 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9238,7 +11813,13 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9255,7 +11836,13 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9273,7 +11860,13 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9291,7 +11884,13 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9308,7 +11907,13 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9326,7 +11931,13 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9356,6 +11967,12 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9385,6 +12002,12 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9413,7 +12036,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9431,7 +12060,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9451,6 +12086,12 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9474,7 +12115,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9491,7 +12138,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9508,7 +12161,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9525,7 +12184,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9542,7 +12207,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9560,7 +12231,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9578,7 +12255,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9595,7 +12278,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9613,7 +12302,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9643,6 +12338,12 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9673,6 +12374,12 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9702,7 +12409,13 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9722,7 +12435,13 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9744,6 +12463,12 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9768,7 +12493,13 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9786,7 +12517,13 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9804,7 +12541,13 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9823,7 +12566,13 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9842,7 +12591,13 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9862,7 +12617,13 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9882,7 +12643,13 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9903,7 +12670,13 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9925,7 +12698,13 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9960,6 +12739,12 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9990,6 +12775,12 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10019,7 +12810,13 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10039,7 +12836,13 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10061,6 +12864,12 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10085,7 +12894,13 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10103,7 +12918,13 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10121,7 +12942,13 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10140,7 +12967,13 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10159,7 +12992,13 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10179,7 +13018,13 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10199,7 +13044,13 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10220,7 +13071,13 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10242,7 +13099,13 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10277,6 +13140,12 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10307,6 +13176,12 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10336,7 +13211,13 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10356,7 +13237,13 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10378,6 +13265,12 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10402,7 +13295,13 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10420,7 +13319,13 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10438,7 +13343,13 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10457,7 +13368,13 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10476,7 +13393,13 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10496,7 +13419,13 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10516,7 +13445,13 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10537,7 +13472,13 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10559,7 +13500,13 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10594,6 +13541,12 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10624,6 +13577,12 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10653,7 +13612,13 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10673,7 +13638,13 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10695,6 +13666,12 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10719,7 +13696,13 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10737,7 +13720,13 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10755,7 +13744,13 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10774,7 +13769,13 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10793,7 +13794,13 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10813,7 +13820,13 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10833,7 +13846,13 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10854,7 +13873,13 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10876,7 +13901,13 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10911,6 +13942,12 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10941,6 +13978,12 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10970,7 +14013,13 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10990,7 +14039,13 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11012,6 +14067,12 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11036,7 +14097,13 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11054,7 +14121,13 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11072,7 +14145,13 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11091,7 +14170,13 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11110,7 +14195,13 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11130,7 +14221,13 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11150,7 +14247,13 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11171,7 +14274,13 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11193,7 +14302,13 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -11228,6 +14343,12 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11258,6 +14379,12 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -11287,7 +14414,13 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11307,7 +14440,13 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11329,6 +14468,12 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11353,7 +14498,13 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11371,7 +14522,13 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11389,7 +14546,13 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11408,7 +14571,13 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11427,7 +14596,13 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11447,7 +14622,13 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11467,7 +14648,13 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11488,7 +14675,13 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11510,7 +14703,13 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -11545,6 +14744,12 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11575,6 +14780,12 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -11604,7 +14815,13 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11624,7 +14841,13 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11646,6 +14869,12 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11670,7 +14899,13 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11688,7 +14923,13 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11706,7 +14947,13 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11725,7 +14972,13 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11744,7 +14997,13 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11764,7 +15023,13 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11784,7 +15049,13 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11805,7 +15076,13 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11827,7 +15104,13 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -11862,6 +15145,12 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11892,6 +15181,12 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -11921,7 +15216,13 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11941,7 +15242,13 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11963,6 +15270,12 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11987,7 +15300,13 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12005,7 +15324,13 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12023,7 +15348,13 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -12042,7 +15373,13 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -12061,7 +15398,13 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -12081,7 +15424,13 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -12101,7 +15450,13 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -12122,7 +15477,13 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -12144,7 +15505,13 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -12178,6 +15545,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; GFX6-LABEL: global_agent_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -12205,12 +15575,16 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -12219,29 +15593,38 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; ; GFX10-WGP-LABEL: global_agent_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12266,101 +15649,131 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_agent_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_agent_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -12374,6 +15787,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; GFX6-LABEL: global_agent_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -12401,12 +15817,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -12415,29 +15835,38 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12462,101 +15891,131 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_agent_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -12570,6 +16029,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; GFX6-LABEL: global_agent_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -12598,14 +16060,18 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_store_dword v[0:1], v2 @@ -12613,33 +16079,44 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; ; GFX10-WGP-LABEL: global_agent_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12664,113 +16141,149 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_agent_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_agent_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -12784,6 +16297,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; GFX6-LABEL: global_agent_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -12813,15 +16329,19 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_store_dword v[0:1], v2 @@ -12829,35 +16349,48 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12883,125 +16416,167 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_agent_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -13015,6 +16590,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; GFX6-LABEL: global_agent_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13036,6 +16614,10 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -13046,27 +16628,38 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; ; GFX10-WGP-LABEL: global_agent_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13084,93 +16677,129 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_agent_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_agent_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -13183,6 +16812,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; GFX6-LABEL: global_agent_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13204,6 +16836,10 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -13214,27 +16850,38 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13252,93 +16899,129 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_agent_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -13351,6 +17034,9 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; GFX6-LABEL: global_agent_one_as_release_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13373,6 +17059,10 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -13384,24 +17074,30 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; ; GFX10-WGP-LABEL: global_agent_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm @@ -13409,6 +17105,9 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13427,109 +17126,138 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_agent_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_agent_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -13537,6 +17265,7 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -13549,6 +17278,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; GFX6-LABEL: global_agent_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13571,6 +17303,10 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -13582,24 +17318,30 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm @@ -13607,6 +17349,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13625,109 +17370,138 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_agent_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -13735,6 +17509,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -13747,6 +17522,10 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX6-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -13766,18 +17545,26 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13787,7 +17574,11 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13798,6 +17589,10 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -13814,7 +17609,11 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13824,7 +17623,11 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13834,7 +17637,11 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13844,7 +17651,11 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13854,7 +17665,11 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13864,7 +17679,11 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13874,7 +17693,11 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13884,7 +17707,11 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13895,7 +17722,11 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX1250-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13913,6 +17744,10 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX6-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -13934,11 +17769,15 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -13947,7 +17786,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13960,7 +17803,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13974,6 +17821,10 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -13991,7 +17842,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14003,7 +17858,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14015,7 +17874,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14027,7 +17890,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14039,7 +17906,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14052,7 +17923,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14065,7 +17940,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14077,7 +17956,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; ; GFX12-CU-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14090,7 +17973,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX1250-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14111,6 +17998,10 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX6-LABEL: global_agent_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -14131,11 +18022,15 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -14143,7 +18038,11 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: global_agent_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14155,7 +18054,11 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: global_agent_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14168,6 +18071,10 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -14185,7 +18092,11 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14196,7 +18107,11 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14207,7 +18122,11 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14219,7 +18138,11 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14231,7 +18154,11 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; ; GFX11-WGP-LABEL: global_agent_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14243,7 +18170,11 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; ; GFX11-CU-LABEL: global_agent_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14255,7 +18186,11 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; ; GFX12-WGP-LABEL: global_agent_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14269,7 +18204,11 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; ; GFX12-CU-LABEL: global_agent_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14284,7 +18223,11 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX1250-LABEL: global_agent_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14307,6 +18250,10 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -14329,11 +18276,15 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -14343,7 +18294,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14358,7 +18313,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14374,6 +18333,10 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -14392,7 +18355,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14405,7 +18372,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14418,7 +18389,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14432,7 +18407,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14446,7 +18425,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14461,7 +18444,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14476,7 +18463,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14492,7 +18483,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14509,7 +18504,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX1250-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14535,6 +18534,10 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -14557,11 +18560,15 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -14571,7 +18578,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14586,7 +18597,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14602,6 +18617,10 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -14620,7 +18639,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14633,7 +18656,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14646,7 +18673,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14660,7 +18691,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14674,7 +18709,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14689,7 +18728,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14704,7 +18747,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14720,7 +18767,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14737,7 +18788,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX1250-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14763,6 +18818,10 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -14786,6 +18845,10 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14801,7 +18864,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14815,7 +18882,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14830,6 +18901,10 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -14848,7 +18923,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14861,7 +18940,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14874,7 +18957,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14887,7 +18974,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14900,7 +18991,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14914,7 +19009,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14928,7 +19027,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14941,7 +19044,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14955,7 +19062,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX1250-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14978,6 +19089,10 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -15002,6 +19117,10 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15018,7 +19137,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15034,7 +19157,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15051,6 +19178,10 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -15070,7 +19201,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15084,7 +19219,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15098,7 +19237,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15113,7 +19256,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15128,7 +19275,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15144,7 +19295,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15160,7 +19315,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15177,7 +19336,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15195,7 +19358,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX1250-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15223,6 +19390,10 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -15247,6 +19418,10 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15263,7 +19438,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15279,7 +19458,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15296,6 +19479,10 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -15315,7 +19502,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15329,7 +19520,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15343,7 +19538,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15358,7 +19557,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15373,7 +19576,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15389,7 +19596,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15405,7 +19616,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15422,7 +19637,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15440,7 +19659,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX1250-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15469,6 +19692,12 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15493,11 +19722,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15507,6 +19741,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15517,7 +19752,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15531,7 +19772,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15547,6 +19794,12 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15567,7 +19820,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15581,7 +19840,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15595,7 +19860,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15609,7 +19880,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15623,7 +19900,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15637,7 +19920,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15651,7 +19940,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15665,7 +19960,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15680,7 +19981,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15704,6 +20011,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15730,11 +20043,16 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15744,6 +20062,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15756,7 +20075,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15773,7 +20098,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15792,6 +20123,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15813,7 +20150,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15829,7 +20172,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15845,7 +20194,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15861,7 +20216,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15877,7 +20238,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15894,7 +20261,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15911,7 +20284,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15927,7 +20306,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15944,7 +20329,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15971,6 +20362,12 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15996,11 +20393,16 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16010,6 +20412,7 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16021,7 +20424,13 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16037,7 +20446,13 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16055,6 +20470,12 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16076,7 +20497,13 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16091,7 +20518,13 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16106,7 +20539,13 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16122,7 +20561,13 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16138,7 +20583,13 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16154,7 +20605,13 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16170,7 +20627,13 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16188,7 +20651,13 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16207,7 +20676,13 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16236,6 +20711,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16263,11 +20744,16 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16277,6 +20763,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16290,7 +20777,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16309,7 +20802,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16330,6 +20829,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16352,7 +20857,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16369,7 +20880,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16386,7 +20903,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16404,7 +20927,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16422,7 +20951,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16441,7 +20976,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16460,7 +21001,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16480,7 +21027,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16501,7 +21054,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16533,6 +21092,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16560,11 +21125,16 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16574,6 +21144,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16587,7 +21158,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16606,7 +21183,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16627,6 +21210,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16649,7 +21238,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16666,7 +21261,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16683,7 +21284,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16701,7 +21308,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16719,7 +21332,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16738,7 +21357,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16757,7 +21382,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16777,7 +21408,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16798,7 +21435,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16830,6 +21473,12 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16856,11 +21505,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16870,6 +21524,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16882,7 +21537,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16899,7 +21560,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16918,6 +21585,12 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16939,7 +21612,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16955,7 +21634,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16971,7 +21656,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16987,7 +21678,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17003,7 +21700,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17020,7 +21723,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17037,7 +21746,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17053,7 +21768,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17070,7 +21791,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17097,6 +21824,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17123,11 +21856,16 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17137,6 +21875,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17149,7 +21888,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17166,7 +21911,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17185,6 +21936,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17206,7 +21963,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17222,7 +21985,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17238,7 +22007,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17254,7 +22029,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17270,7 +22051,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17287,7 +22074,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17304,7 +22097,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17320,7 +22119,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17337,7 +22142,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17364,6 +22175,12 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17391,11 +22208,16 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17405,6 +22227,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17418,7 +22241,13 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17437,7 +22266,13 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17458,6 +22293,12 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17480,7 +22321,13 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17497,7 +22344,13 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17514,7 +22367,13 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17532,7 +22391,13 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17550,7 +22415,13 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17569,7 +22440,13 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17588,7 +22465,13 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17608,7 +22491,13 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17629,7 +22518,13 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17661,6 +22556,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17688,11 +22589,16 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17702,6 +22608,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17715,7 +22622,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17734,7 +22647,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17755,6 +22674,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17777,7 +22702,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17794,7 +22725,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17811,7 +22748,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17829,7 +22772,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17847,7 +22796,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17866,7 +22821,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17885,7 +22846,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17905,7 +22872,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17926,7 +22899,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17958,6 +22937,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17985,11 +22970,16 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17999,6 +22989,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -18012,7 +23003,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18031,7 +23028,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18052,6 +23055,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -18074,7 +23083,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18091,7 +23106,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18108,7 +23129,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18126,7 +23153,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18144,7 +23177,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18163,7 +23202,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18182,7 +23227,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18202,7 +23253,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18223,7 +23280,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18255,6 +23318,12 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -18282,11 +23351,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -18296,6 +23370,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -18309,7 +23384,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18328,7 +23409,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18349,6 +23436,12 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -18371,7 +23464,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18388,7 +23487,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18405,7 +23510,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18423,7 +23534,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18441,7 +23558,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18460,7 +23583,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18479,7 +23608,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18499,7 +23634,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18520,7 +23661,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18552,6 +23699,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -18579,11 +23732,16 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -18593,6 +23751,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -18606,7 +23765,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18625,7 +23790,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18646,6 +23817,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -18668,7 +23845,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18685,7 +23868,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18702,7 +23891,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18720,7 +23915,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18738,7 +23939,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18757,7 +23964,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18776,7 +23989,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18796,7 +24015,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18817,7 +24042,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18849,6 +24080,12 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -18876,11 +24113,16 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -18890,6 +24132,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -18903,7 +24146,13 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18922,7 +24171,13 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18943,6 +24198,12 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -18965,7 +24226,13 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18982,7 +24249,13 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18999,7 +24272,13 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19017,7 +24296,13 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19035,7 +24320,13 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19054,7 +24345,13 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19073,7 +24370,13 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19093,7 +24396,13 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19114,7 +24423,13 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19146,6 +24461,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -19173,11 +24494,16 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -19187,6 +24513,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -19200,7 +24527,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19219,7 +24552,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19240,6 +24579,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -19262,7 +24607,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19279,7 +24630,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19296,7 +24653,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19314,7 +24677,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19332,7 +24701,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19351,7 +24726,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19370,7 +24751,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19390,7 +24777,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19411,7 +24804,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19443,6 +24842,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -19470,11 +24875,16 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -19484,6 +24894,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -19497,7 +24908,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19516,7 +24933,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19537,6 +24960,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -19559,7 +24988,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19576,7 +25011,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19593,7 +25034,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19611,7 +25058,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19629,7 +25082,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19648,7 +25107,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19667,7 +25132,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19687,7 +25158,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19708,7 +25185,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19740,6 +25223,12 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -19768,6 +25257,12 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19795,7 +25290,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19811,7 +25312,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19829,6 +25336,12 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -19852,7 +25365,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19868,7 +25387,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19884,7 +25409,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19900,7 +25431,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19916,7 +25453,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19932,7 +25475,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19948,7 +25497,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19964,7 +25519,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19981,7 +25542,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20009,6 +25576,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -20038,6 +25611,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20066,7 +25645,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20084,7 +25669,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20104,6 +25695,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -20127,7 +25724,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20144,7 +25747,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20161,7 +25770,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20178,7 +25793,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20195,7 +25816,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20213,7 +25840,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20231,7 +25864,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20248,7 +25887,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20266,7 +25911,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20296,6 +25947,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -20326,6 +25983,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20355,7 +26018,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20375,7 +26044,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20397,6 +26072,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -20421,7 +26102,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20439,7 +26126,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20457,7 +26150,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20476,7 +26175,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20495,7 +26200,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20515,7 +26226,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20535,7 +26252,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20556,7 +26279,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20578,7 +26307,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20613,6 +26348,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -20643,6 +26384,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20672,7 +26419,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20692,7 +26445,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20714,6 +26473,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -20738,7 +26503,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20756,7 +26527,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20774,7 +26551,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20793,7 +26576,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20812,7 +26601,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20832,7 +26627,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20852,7 +26653,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20873,7 +26680,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20895,7 +26708,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20930,6 +26749,12 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -20959,6 +26784,12 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20987,7 +26818,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21005,7 +26842,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21025,6 +26868,12 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -21048,7 +26897,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21065,7 +26920,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21082,7 +26943,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21099,7 +26966,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21116,7 +26989,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21134,7 +27013,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21152,7 +27037,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21169,7 +27060,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21187,7 +27084,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21217,6 +27120,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -21246,6 +27155,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21274,7 +27189,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21292,7 +27213,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21312,6 +27239,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -21335,7 +27268,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21352,7 +27291,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21369,7 +27314,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21386,7 +27337,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21403,7 +27360,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21421,7 +27384,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21439,7 +27408,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21456,7 +27431,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21474,7 +27455,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21504,6 +27491,12 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -21534,6 +27527,12 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21563,7 +27562,13 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21583,7 +27588,13 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21605,6 +27616,12 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -21629,7 +27646,13 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21647,7 +27670,13 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21665,7 +27694,13 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21684,7 +27719,13 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21703,7 +27744,13 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21723,7 +27770,13 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21743,7 +27796,13 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21764,7 +27823,13 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21786,7 +27851,13 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21821,6 +27892,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -21851,6 +27928,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21880,7 +27963,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21900,7 +27989,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21922,6 +28017,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -21946,7 +28047,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21964,7 +28071,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21982,7 +28095,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22001,7 +28120,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22020,7 +28145,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22040,7 +28171,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22060,7 +28197,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22081,7 +28224,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22103,7 +28252,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -22138,6 +28293,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -22168,6 +28329,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -22197,7 +28364,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22217,7 +28390,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22239,6 +28418,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -22263,7 +28448,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22281,7 +28472,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22299,7 +28496,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22318,7 +28521,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22337,7 +28546,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22357,7 +28572,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22377,7 +28598,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22398,7 +28625,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22420,7 +28653,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -22455,6 +28694,12 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -22485,6 +28730,12 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -22514,7 +28765,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22534,7 +28791,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22556,6 +28819,12 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -22580,7 +28849,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22598,7 +28873,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22616,7 +28897,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22635,7 +28922,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22654,7 +28947,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22674,7 +28973,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22694,7 +28999,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22715,7 +29026,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22737,7 +29054,13 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -22772,6 +29095,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -22802,6 +29131,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -22831,7 +29166,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22851,7 +29192,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22873,6 +29220,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -22897,7 +29250,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22915,7 +29274,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22933,7 +29298,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22952,7 +29323,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22971,7 +29348,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22991,7 +29374,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23011,7 +29400,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23032,7 +29427,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23054,7 +29455,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -23089,6 +29496,12 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -23119,6 +29532,12 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -23148,7 +29567,13 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23168,7 +29593,13 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23190,6 +29621,12 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -23214,7 +29651,13 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23232,7 +29675,13 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23250,7 +29699,13 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -23269,7 +29724,13 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -23288,7 +29749,13 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23308,7 +29775,13 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23328,7 +29801,13 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23349,7 +29828,13 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23371,7 +29856,13 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -23406,6 +29897,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -23436,6 +29933,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -23465,7 +29968,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23485,7 +29994,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23507,6 +30022,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -23531,7 +30052,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23549,7 +30076,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23567,7 +30100,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -23586,7 +30125,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -23605,7 +30150,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23625,7 +30176,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23645,7 +30202,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23666,7 +30229,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23688,7 +30257,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -23723,6 +30298,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -23753,6 +30334,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -23782,7 +30369,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23802,7 +30395,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23824,6 +30423,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -23848,7 +30453,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23866,7 +30477,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23884,7 +30501,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -23903,7 +30526,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -23922,7 +30551,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23942,7 +30577,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23962,7 +30603,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23983,7 +30630,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -24005,7 +30658,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll index ad47cbcb5017..ff45d32a51b6 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll @@ -18,6 +18,9 @@ define amdgpu_kernel void @global_cluster_unordered_load( ; GFX6-LABEL: global_cluster_unordered_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -45,12 +48,16 @@ define amdgpu_kernel void @global_cluster_unordered_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -59,29 +66,38 @@ define amdgpu_kernel void @global_cluster_unordered_load( ; ; GFX10-WGP-LABEL: global_cluster_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_cluster_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_cluster_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -106,101 +122,131 @@ define amdgpu_kernel void @global_cluster_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_cluster_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_cluster_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_cluster_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_cluster_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_cluster_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_cluster_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_cluster_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -214,6 +260,9 @@ define amdgpu_kernel void @global_cluster_monotonic_load( ; GFX6-LABEL: global_cluster_monotonic_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -241,12 +290,16 @@ define amdgpu_kernel void @global_cluster_monotonic_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -255,29 +308,38 @@ define amdgpu_kernel void @global_cluster_monotonic_load( ; ; GFX10-WGP-LABEL: global_cluster_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_cluster_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_cluster_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -302,101 +364,131 @@ define amdgpu_kernel void @global_cluster_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_cluster_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_cluster_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_cluster_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_cluster_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_cluster_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -410,6 +502,9 @@ define amdgpu_kernel void @global_cluster_acquire_load( ; GFX6-LABEL: global_cluster_acquire_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -438,14 +533,18 @@ define amdgpu_kernel void @global_cluster_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_store_dword v[0:1], v2 @@ -453,33 +552,44 @@ define amdgpu_kernel void @global_cluster_acquire_load( ; ; GFX10-WGP-LABEL: global_cluster_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_cluster_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_cluster_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -504,112 +614,148 @@ define amdgpu_kernel void @global_cluster_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_cluster_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_cluster_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_cluster_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_cluster_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_cluster_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_cluster_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -623,6 +769,9 @@ define amdgpu_kernel void @global_cluster_seq_cst_load( ; GFX6-LABEL: global_cluster_seq_cst_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -652,9 +801,12 @@ define amdgpu_kernel void @global_cluster_seq_cst_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -668,8 +820,12 @@ define amdgpu_kernel void @global_cluster_seq_cst_load( ; ; GFX10-WGP-LABEL: global_cluster_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -682,8 +838,12 @@ define amdgpu_kernel void @global_cluster_seq_cst_load( ; ; GFX10-CU-LABEL: global_cluster_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -697,6 +857,9 @@ define amdgpu_kernel void @global_cluster_seq_cst_load( ; SKIP-CACHE-INV-LABEL: global_cluster_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -722,8 +885,12 @@ define amdgpu_kernel void @global_cluster_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc @@ -734,8 +901,12 @@ define amdgpu_kernel void @global_cluster_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc @@ -746,8 +917,12 @@ define amdgpu_kernel void @global_cluster_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 @@ -758,8 +933,12 @@ define amdgpu_kernel void @global_cluster_seq_cst_load( ; ; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 @@ -770,8 +949,12 @@ define amdgpu_kernel void @global_cluster_seq_cst_load( ; ; GFX11-WGP-LABEL: global_cluster_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -784,8 +967,12 @@ define amdgpu_kernel void @global_cluster_seq_cst_load( ; ; GFX11-CU-LABEL: global_cluster_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -798,48 +985,60 @@ define amdgpu_kernel void @global_cluster_seq_cst_load( ; ; GFX12-WGP-LABEL: global_cluster_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_cluster_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_cluster_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -853,6 +1052,9 @@ define amdgpu_kernel void @global_cluster_unordered_store( ; GFX6-LABEL: global_cluster_unordered_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -874,6 +1076,10 @@ define amdgpu_kernel void @global_cluster_unordered_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -884,27 +1090,38 @@ define amdgpu_kernel void @global_cluster_unordered_store( ; ; GFX10-WGP-LABEL: global_cluster_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_cluster_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_cluster_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -922,93 +1139,129 @@ define amdgpu_kernel void @global_cluster_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_cluster_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_cluster_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_cluster_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_cluster_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_cluster_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_cluster_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_cluster_unordered_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1021,6 +1274,9 @@ define amdgpu_kernel void @global_cluster_monotonic_store( ; GFX6-LABEL: global_cluster_monotonic_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1042,6 +1298,10 @@ define amdgpu_kernel void @global_cluster_monotonic_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1052,27 +1312,38 @@ define amdgpu_kernel void @global_cluster_monotonic_store( ; ; GFX10-WGP-LABEL: global_cluster_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_cluster_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_cluster_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1090,93 +1361,129 @@ define amdgpu_kernel void @global_cluster_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_cluster_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_cluster_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_cluster_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_cluster_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_cluster_monotonic_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1189,6 +1496,9 @@ define amdgpu_kernel void @global_cluster_release_store( ; GFX6-LABEL: global_cluster_release_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1211,6 +1521,10 @@ define amdgpu_kernel void @global_cluster_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1222,10 +1536,13 @@ define amdgpu_kernel void @global_cluster_release_store( ; ; GFX10-WGP-LABEL: global_cluster_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1234,10 +1551,13 @@ define amdgpu_kernel void @global_cluster_release_store( ; ; GFX10-CU-LABEL: global_cluster_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1247,6 +1567,9 @@ define amdgpu_kernel void @global_cluster_release_store( ; SKIP-CACHE-INV-LABEL: global_cluster_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1265,10 +1588,13 @@ define amdgpu_kernel void @global_cluster_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -1276,10 +1602,13 @@ define amdgpu_kernel void @global_cluster_release_store( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -1287,10 +1616,13 @@ define amdgpu_kernel void @global_cluster_release_store( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1299,10 +1631,13 @@ define amdgpu_kernel void @global_cluster_release_store( ; ; GFX942-TGSPLIT-LABEL: global_cluster_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1311,10 +1646,13 @@ define amdgpu_kernel void @global_cluster_release_store( ; ; GFX11-WGP-LABEL: global_cluster_release_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1323,10 +1661,13 @@ define amdgpu_kernel void @global_cluster_release_store( ; ; GFX11-CU-LABEL: global_cluster_release_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1335,43 +1676,55 @@ define amdgpu_kernel void @global_cluster_release_store( ; ; GFX12-WGP-LABEL: global_cluster_release_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_cluster_release_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_cluster_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1384,6 +1737,9 @@ define amdgpu_kernel void @global_cluster_seq_cst_store( ; GFX6-LABEL: global_cluster_seq_cst_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1406,6 +1762,10 @@ define amdgpu_kernel void @global_cluster_seq_cst_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1417,10 +1777,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_store( ; ; GFX10-WGP-LABEL: global_cluster_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1429,10 +1792,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_store( ; ; GFX10-CU-LABEL: global_cluster_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1442,6 +1808,9 @@ define amdgpu_kernel void @global_cluster_seq_cst_store( ; SKIP-CACHE-INV-LABEL: global_cluster_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1460,10 +1829,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -1471,10 +1843,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -1482,10 +1857,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_store( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1494,10 +1872,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_store( ; ; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1506,10 +1887,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_store( ; ; GFX11-WGP-LABEL: global_cluster_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1518,10 +1902,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_store( ; ; GFX11-CU-LABEL: global_cluster_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1530,43 +1917,55 @@ define amdgpu_kernel void @global_cluster_seq_cst_store( ; ; GFX12-WGP-LABEL: global_cluster_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_cluster_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_cluster_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1579,6 +1978,10 @@ define amdgpu_kernel void @global_cluster_monotonic_atomicrmw( ; GFX6-LABEL: global_cluster_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -1598,18 +2001,26 @@ define amdgpu_kernel void @global_cluster_monotonic_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_cluster_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1619,7 +2030,11 @@ define amdgpu_kernel void @global_cluster_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: global_cluster_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1630,6 +2045,10 @@ define amdgpu_kernel void @global_cluster_monotonic_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_cluster_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -1646,7 +2065,11 @@ define amdgpu_kernel void @global_cluster_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1656,7 +2079,11 @@ define amdgpu_kernel void @global_cluster_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1666,7 +2093,11 @@ define amdgpu_kernel void @global_cluster_monotonic_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1676,7 +2107,11 @@ define amdgpu_kernel void @global_cluster_monotonic_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1686,7 +2121,11 @@ define amdgpu_kernel void @global_cluster_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: global_cluster_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1696,7 +2135,11 @@ define amdgpu_kernel void @global_cluster_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: global_cluster_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1706,7 +2149,11 @@ define amdgpu_kernel void @global_cluster_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: global_cluster_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1716,7 +2163,11 @@ define amdgpu_kernel void @global_cluster_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: global_cluster_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1727,7 +2178,11 @@ define amdgpu_kernel void @global_cluster_monotonic_atomicrmw( ; GFX1250-LABEL: global_cluster_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1745,6 +2200,10 @@ define amdgpu_kernel void @global_cluster_acquire_atomicrmw( ; GFX6-LABEL: global_cluster_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -1766,11 +2225,15 @@ define amdgpu_kernel void @global_cluster_acquire_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1779,7 +2242,11 @@ define amdgpu_kernel void @global_cluster_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: global_cluster_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1792,7 +2259,11 @@ define amdgpu_kernel void @global_cluster_acquire_atomicrmw( ; ; GFX10-CU-LABEL: global_cluster_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1806,6 +2277,10 @@ define amdgpu_kernel void @global_cluster_acquire_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_cluster_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -1823,7 +2298,11 @@ define amdgpu_kernel void @global_cluster_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1835,7 +2314,11 @@ define amdgpu_kernel void @global_cluster_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1847,7 +2330,11 @@ define amdgpu_kernel void @global_cluster_acquire_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1859,7 +2346,11 @@ define amdgpu_kernel void @global_cluster_acquire_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_cluster_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1871,7 +2362,11 @@ define amdgpu_kernel void @global_cluster_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: global_cluster_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1884,7 +2379,11 @@ define amdgpu_kernel void @global_cluster_acquire_atomicrmw( ; ; GFX11-CU-LABEL: global_cluster_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1897,7 +2396,11 @@ define amdgpu_kernel void @global_cluster_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: global_cluster_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1909,7 +2412,11 @@ define amdgpu_kernel void @global_cluster_acquire_atomicrmw( ; ; GFX12-CU-LABEL: global_cluster_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1922,7 +2429,11 @@ define amdgpu_kernel void @global_cluster_acquire_atomicrmw( ; GFX1250-LABEL: global_cluster_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1942,6 +2453,10 @@ define amdgpu_kernel void @global_cluster_release_atomicrmw( ; GFX6-LABEL: global_cluster_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -1962,11 +2477,15 @@ define amdgpu_kernel void @global_cluster_release_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -1974,7 +2493,11 @@ define amdgpu_kernel void @global_cluster_release_atomicrmw( ; ; GFX10-WGP-LABEL: global_cluster_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1986,7 +2509,11 @@ define amdgpu_kernel void @global_cluster_release_atomicrmw( ; ; GFX10-CU-LABEL: global_cluster_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1999,6 +2526,10 @@ define amdgpu_kernel void @global_cluster_release_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_cluster_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2016,7 +2547,11 @@ define amdgpu_kernel void @global_cluster_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2027,7 +2562,11 @@ define amdgpu_kernel void @global_cluster_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2038,7 +2577,11 @@ define amdgpu_kernel void @global_cluster_release_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2050,7 +2593,11 @@ define amdgpu_kernel void @global_cluster_release_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_cluster_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2062,7 +2609,11 @@ define amdgpu_kernel void @global_cluster_release_atomicrmw( ; ; GFX11-WGP-LABEL: global_cluster_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2074,7 +2625,11 @@ define amdgpu_kernel void @global_cluster_release_atomicrmw( ; ; GFX11-CU-LABEL: global_cluster_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2086,7 +2641,11 @@ define amdgpu_kernel void @global_cluster_release_atomicrmw( ; ; GFX12-WGP-LABEL: global_cluster_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2100,7 +2659,11 @@ define amdgpu_kernel void @global_cluster_release_atomicrmw( ; ; GFX12-CU-LABEL: global_cluster_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2115,7 +2678,11 @@ define amdgpu_kernel void @global_cluster_release_atomicrmw( ; GFX1250-LABEL: global_cluster_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2135,6 +2702,10 @@ define amdgpu_kernel void @global_cluster_acq_rel_atomicrmw( ; GFX6-LABEL: global_cluster_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2157,11 +2728,15 @@ define amdgpu_kernel void @global_cluster_acq_rel_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -2171,7 +2746,11 @@ define amdgpu_kernel void @global_cluster_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: global_cluster_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2186,7 +2765,11 @@ define amdgpu_kernel void @global_cluster_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: global_cluster_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2202,6 +2785,10 @@ define amdgpu_kernel void @global_cluster_acq_rel_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_cluster_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2220,7 +2807,11 @@ define amdgpu_kernel void @global_cluster_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2233,7 +2824,11 @@ define amdgpu_kernel void @global_cluster_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2246,7 +2841,11 @@ define amdgpu_kernel void @global_cluster_acq_rel_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2260,7 +2859,11 @@ define amdgpu_kernel void @global_cluster_acq_rel_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_cluster_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2274,7 +2877,11 @@ define amdgpu_kernel void @global_cluster_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: global_cluster_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2289,7 +2896,11 @@ define amdgpu_kernel void @global_cluster_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: global_cluster_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2304,7 +2915,11 @@ define amdgpu_kernel void @global_cluster_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: global_cluster_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2320,7 +2935,11 @@ define amdgpu_kernel void @global_cluster_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: global_cluster_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2337,7 +2956,11 @@ define amdgpu_kernel void @global_cluster_acq_rel_atomicrmw( ; GFX1250-LABEL: global_cluster_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2359,6 +2982,10 @@ define amdgpu_kernel void @global_cluster_seq_cst_atomicrmw( ; GFX6-LABEL: global_cluster_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2381,11 +3008,15 @@ define amdgpu_kernel void @global_cluster_seq_cst_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -2395,7 +3026,11 @@ define amdgpu_kernel void @global_cluster_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: global_cluster_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2410,7 +3045,11 @@ define amdgpu_kernel void @global_cluster_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: global_cluster_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2426,6 +3065,10 @@ define amdgpu_kernel void @global_cluster_seq_cst_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_cluster_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2444,7 +3087,11 @@ define amdgpu_kernel void @global_cluster_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2457,7 +3104,11 @@ define amdgpu_kernel void @global_cluster_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2470,7 +3121,11 @@ define amdgpu_kernel void @global_cluster_seq_cst_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2484,7 +3139,11 @@ define amdgpu_kernel void @global_cluster_seq_cst_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2498,7 +3157,11 @@ define amdgpu_kernel void @global_cluster_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: global_cluster_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2513,7 +3176,11 @@ define amdgpu_kernel void @global_cluster_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: global_cluster_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2528,7 +3195,11 @@ define amdgpu_kernel void @global_cluster_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: global_cluster_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2544,7 +3215,11 @@ define amdgpu_kernel void @global_cluster_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: global_cluster_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2561,7 +3236,11 @@ define amdgpu_kernel void @global_cluster_seq_cst_atomicrmw( ; GFX1250-LABEL: global_cluster_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2583,6 +3262,10 @@ define amdgpu_kernel void @global_cluster_acquire_ret_atomicrmw( ; GFX6-LABEL: global_cluster_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2606,6 +3289,10 @@ define amdgpu_kernel void @global_cluster_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2621,7 +3308,11 @@ define amdgpu_kernel void @global_cluster_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_cluster_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2635,7 +3326,11 @@ define amdgpu_kernel void @global_cluster_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_cluster_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2650,6 +3345,10 @@ define amdgpu_kernel void @global_cluster_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_cluster_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2668,7 +3367,11 @@ define amdgpu_kernel void @global_cluster_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2681,7 +3384,11 @@ define amdgpu_kernel void @global_cluster_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2694,7 +3401,11 @@ define amdgpu_kernel void @global_cluster_acquire_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2707,7 +3418,11 @@ define amdgpu_kernel void @global_cluster_acquire_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_cluster_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2720,7 +3435,11 @@ define amdgpu_kernel void @global_cluster_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_cluster_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2734,7 +3453,11 @@ define amdgpu_kernel void @global_cluster_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_cluster_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2748,7 +3471,11 @@ define amdgpu_kernel void @global_cluster_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_cluster_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2761,7 +3488,11 @@ define amdgpu_kernel void @global_cluster_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_cluster_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2775,7 +3506,11 @@ define amdgpu_kernel void @global_cluster_acquire_ret_atomicrmw( ; GFX1250-LABEL: global_cluster_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2797,6 +3532,10 @@ define amdgpu_kernel void @global_cluster_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_cluster_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2821,6 +3560,10 @@ define amdgpu_kernel void @global_cluster_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2837,7 +3580,11 @@ define amdgpu_kernel void @global_cluster_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_cluster_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2853,7 +3600,11 @@ define amdgpu_kernel void @global_cluster_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_cluster_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2870,6 +3621,10 @@ define amdgpu_kernel void @global_cluster_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_cluster_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2889,7 +3644,11 @@ define amdgpu_kernel void @global_cluster_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2903,7 +3662,11 @@ define amdgpu_kernel void @global_cluster_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2917,7 +3680,11 @@ define amdgpu_kernel void @global_cluster_acq_rel_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2932,7 +3699,11 @@ define amdgpu_kernel void @global_cluster_acq_rel_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_cluster_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2947,7 +3718,11 @@ define amdgpu_kernel void @global_cluster_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_cluster_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2963,7 +3738,11 @@ define amdgpu_kernel void @global_cluster_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_cluster_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2979,7 +3758,11 @@ define amdgpu_kernel void @global_cluster_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_cluster_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2996,7 +3779,11 @@ define amdgpu_kernel void @global_cluster_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_cluster_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3014,7 +3801,11 @@ define amdgpu_kernel void @global_cluster_acq_rel_ret_atomicrmw( ; GFX1250-LABEL: global_cluster_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -3038,6 +3829,10 @@ define amdgpu_kernel void @global_cluster_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_cluster_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -3062,6 +3857,10 @@ define amdgpu_kernel void @global_cluster_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -3078,7 +3877,11 @@ define amdgpu_kernel void @global_cluster_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_cluster_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3094,7 +3897,11 @@ define amdgpu_kernel void @global_cluster_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_cluster_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3111,6 +3918,10 @@ define amdgpu_kernel void @global_cluster_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_cluster_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -3130,7 +3941,11 @@ define amdgpu_kernel void @global_cluster_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3144,7 +3959,11 @@ define amdgpu_kernel void @global_cluster_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3158,7 +3977,11 @@ define amdgpu_kernel void @global_cluster_seq_cst_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3173,7 +3996,11 @@ define amdgpu_kernel void @global_cluster_seq_cst_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3188,7 +4015,11 @@ define amdgpu_kernel void @global_cluster_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_cluster_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3204,7 +4035,11 @@ define amdgpu_kernel void @global_cluster_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_cluster_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3220,7 +4055,11 @@ define amdgpu_kernel void @global_cluster_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_cluster_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3237,7 +4076,11 @@ define amdgpu_kernel void @global_cluster_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_cluster_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3255,7 +4098,11 @@ define amdgpu_kernel void @global_cluster_seq_cst_ret_atomicrmw( ; GFX1250-LABEL: global_cluster_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -3280,6 +4127,12 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3304,11 +4157,16 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3318,6 +4176,7 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3328,7 +4187,13 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3342,7 +4207,13 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3358,6 +4229,12 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3378,7 +4255,13 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3392,7 +4275,13 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3406,7 +4295,13 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3420,7 +4315,13 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3434,7 +4335,13 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3448,7 +4355,13 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3462,7 +4375,13 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3476,7 +4395,13 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3491,7 +4416,13 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: global_cluster_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3515,6 +4446,12 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3541,11 +4478,16 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3555,6 +4497,7 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3567,7 +4510,13 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3584,7 +4533,13 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3603,6 +4558,12 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3624,7 +4585,13 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3640,7 +4607,13 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3656,7 +4629,13 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3672,7 +4651,13 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3688,7 +4673,13 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3705,7 +4696,13 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3722,7 +4719,13 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3738,7 +4741,13 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3755,7 +4764,13 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: global_cluster_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3781,6 +4796,12 @@ define amdgpu_kernel void @global_cluster_release_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3806,11 +4827,16 @@ define amdgpu_kernel void @global_cluster_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3820,6 +4846,7 @@ define amdgpu_kernel void @global_cluster_release_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3831,7 +4858,13 @@ define amdgpu_kernel void @global_cluster_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3847,7 +4880,13 @@ define amdgpu_kernel void @global_cluster_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3865,6 +4904,12 @@ define amdgpu_kernel void @global_cluster_release_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3886,7 +4931,13 @@ define amdgpu_kernel void @global_cluster_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3901,7 +4952,13 @@ define amdgpu_kernel void @global_cluster_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3916,7 +4973,13 @@ define amdgpu_kernel void @global_cluster_release_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3932,7 +4995,13 @@ define amdgpu_kernel void @global_cluster_release_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3948,7 +5017,13 @@ define amdgpu_kernel void @global_cluster_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3964,7 +5039,13 @@ define amdgpu_kernel void @global_cluster_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3980,7 +5061,13 @@ define amdgpu_kernel void @global_cluster_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3998,7 +5085,13 @@ define amdgpu_kernel void @global_cluster_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4017,7 +5110,13 @@ define amdgpu_kernel void @global_cluster_release_monotonic_cmpxchg( ; GFX1250-LABEL: global_cluster_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4043,6 +5142,12 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4070,11 +5175,16 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4084,6 +5194,7 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4097,7 +5208,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4116,7 +5233,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4137,6 +5260,12 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4159,7 +5288,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4176,7 +5311,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4193,7 +5334,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4211,7 +5358,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4229,7 +5382,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4248,7 +5407,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4267,7 +5432,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4287,7 +5458,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4308,7 +5485,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4336,6 +5519,12 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4363,11 +5552,16 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4377,6 +5571,7 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4390,7 +5585,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4409,7 +5610,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4430,6 +5637,12 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4452,7 +5665,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4469,7 +5688,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4486,7 +5711,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4504,7 +5735,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4522,7 +5759,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4541,7 +5784,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4560,7 +5809,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4580,7 +5835,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4601,7 +5862,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4629,6 +5896,12 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4655,11 +5928,16 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4669,6 +5947,7 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4681,7 +5960,13 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4698,7 +5983,13 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4717,6 +6008,12 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4738,7 +6035,13 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4754,7 +6057,13 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4770,7 +6079,13 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4786,7 +6101,13 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4802,7 +6123,13 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4819,7 +6146,13 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4836,7 +6169,13 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4852,7 +6191,13 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4869,7 +6214,13 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: global_cluster_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4895,6 +6246,12 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4921,11 +6278,16 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4935,6 +6297,7 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4947,7 +6310,13 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4964,7 +6333,13 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4983,6 +6358,12 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5004,7 +6385,13 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5020,7 +6407,13 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5036,7 +6429,13 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5052,7 +6451,13 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5068,7 +6473,13 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5085,7 +6496,13 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5102,7 +6519,13 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5118,7 +6541,13 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5135,7 +6564,13 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_cmpxchg( ; GFX1250-LABEL: global_cluster_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5161,6 +6596,12 @@ define amdgpu_kernel void @global_cluster_release_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5188,11 +6629,16 @@ define amdgpu_kernel void @global_cluster_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5202,6 +6648,7 @@ define amdgpu_kernel void @global_cluster_release_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5215,7 +6662,13 @@ define amdgpu_kernel void @global_cluster_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5234,7 +6687,13 @@ define amdgpu_kernel void @global_cluster_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5255,6 +6714,12 @@ define amdgpu_kernel void @global_cluster_release_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5277,7 +6742,13 @@ define amdgpu_kernel void @global_cluster_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5294,7 +6765,13 @@ define amdgpu_kernel void @global_cluster_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5311,7 +6788,13 @@ define amdgpu_kernel void @global_cluster_release_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5329,7 +6812,13 @@ define amdgpu_kernel void @global_cluster_release_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5347,7 +6836,13 @@ define amdgpu_kernel void @global_cluster_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5366,7 +6861,13 @@ define amdgpu_kernel void @global_cluster_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5385,7 +6886,13 @@ define amdgpu_kernel void @global_cluster_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5405,7 +6912,13 @@ define amdgpu_kernel void @global_cluster_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5426,7 +6939,13 @@ define amdgpu_kernel void @global_cluster_release_acquire_cmpxchg( ; GFX1250-LABEL: global_cluster_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5454,6 +6973,12 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5481,11 +7006,16 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5495,6 +7025,7 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5508,7 +7039,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5527,7 +7064,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5548,6 +7091,12 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5570,7 +7119,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5587,7 +7142,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5604,7 +7165,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5622,7 +7189,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5640,7 +7213,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5659,7 +7238,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5678,7 +7263,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5698,7 +7289,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5719,7 +7316,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: global_cluster_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5747,6 +7350,12 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5774,11 +7383,16 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5788,6 +7402,7 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5801,7 +7416,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5820,7 +7441,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5841,6 +7468,12 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5863,7 +7496,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5880,7 +7519,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5897,7 +7542,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5915,7 +7566,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5933,7 +7590,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5952,7 +7615,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5971,7 +7640,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5991,7 +7666,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6012,7 +7693,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: global_cluster_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6040,6 +7727,12 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6067,11 +7760,16 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6081,6 +7779,7 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6094,7 +7793,13 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6113,7 +7818,13 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6134,6 +7845,12 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6156,7 +7873,13 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6173,7 +7896,13 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6190,7 +7919,13 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6208,7 +7943,13 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6226,7 +7967,13 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6245,7 +7992,13 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6264,7 +8017,13 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6284,7 +8043,13 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6305,7 +8070,13 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6333,6 +8104,12 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6360,11 +8137,16 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6374,6 +8156,7 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6387,7 +8170,13 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6406,7 +8195,13 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6427,6 +8222,12 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6449,7 +8250,13 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6466,7 +8273,13 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6483,7 +8296,13 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6501,7 +8320,13 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6519,7 +8344,13 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6538,7 +8369,13 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6557,7 +8394,13 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6577,7 +8420,13 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6598,7 +8447,13 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: global_cluster_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6626,6 +8481,12 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6653,11 +8514,16 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6667,6 +8533,7 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6680,7 +8547,13 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6699,7 +8572,13 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6720,6 +8599,12 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6742,7 +8627,13 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6759,7 +8650,13 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6776,7 +8673,13 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6794,7 +8697,13 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6812,7 +8721,13 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6831,7 +8746,13 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6850,7 +8771,13 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6870,7 +8797,13 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6891,7 +8824,13 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_cmpxchg( ; GFX1250-LABEL: global_cluster_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6919,6 +8858,12 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6946,11 +8891,16 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6960,6 +8910,7 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6973,7 +8924,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6992,7 +8949,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7013,6 +8976,12 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7035,7 +9004,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7052,7 +9027,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7069,7 +9050,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7087,7 +9074,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7105,7 +9098,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7124,7 +9123,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7143,7 +9148,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7163,7 +9174,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7184,7 +9201,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7212,6 +9235,12 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7239,11 +9268,16 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -7253,6 +9287,7 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -7266,7 +9301,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7285,7 +9326,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7306,6 +9353,12 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7328,7 +9381,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7345,7 +9404,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7362,7 +9427,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7380,7 +9451,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7398,7 +9475,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7417,7 +9500,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7436,7 +9525,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7456,7 +9551,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7477,7 +9578,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7505,6 +9612,12 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7533,6 +9646,12 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7560,7 +9679,13 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7576,7 +9701,13 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7594,6 +9725,12 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7617,7 +9754,13 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7633,7 +9776,13 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7649,7 +9798,13 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7665,7 +9820,13 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7681,7 +9842,13 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7697,7 +9864,13 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7713,7 +9886,13 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7729,7 +9908,13 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7746,7 +9931,13 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7774,6 +9965,12 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7803,6 +10000,12 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7831,7 +10034,13 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7849,7 +10058,13 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7869,6 +10084,12 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7892,7 +10113,13 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7909,7 +10136,13 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7926,7 +10159,13 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7943,7 +10182,13 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7960,7 +10205,13 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7978,7 +10229,13 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7996,7 +10253,13 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8013,7 +10276,13 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8031,7 +10300,13 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8060,6 +10335,12 @@ define amdgpu_kernel void @global_cluster_release_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8089,6 +10370,12 @@ define amdgpu_kernel void @global_cluster_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8117,7 +10404,13 @@ define amdgpu_kernel void @global_cluster_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8135,7 +10428,13 @@ define amdgpu_kernel void @global_cluster_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8155,6 +10454,12 @@ define amdgpu_kernel void @global_cluster_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8179,7 +10484,13 @@ define amdgpu_kernel void @global_cluster_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8196,7 +10507,13 @@ define amdgpu_kernel void @global_cluster_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8213,7 +10530,13 @@ define amdgpu_kernel void @global_cluster_release_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8231,7 +10554,13 @@ define amdgpu_kernel void @global_cluster_release_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8249,7 +10578,13 @@ define amdgpu_kernel void @global_cluster_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8267,7 +10602,13 @@ define amdgpu_kernel void @global_cluster_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8285,7 +10626,13 @@ define amdgpu_kernel void @global_cluster_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8305,7 +10652,13 @@ define amdgpu_kernel void @global_cluster_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8326,7 +10679,13 @@ define amdgpu_kernel void @global_cluster_release_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8356,6 +10715,12 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8386,6 +10751,12 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8415,7 +10786,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8435,7 +10812,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8457,6 +10840,12 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8481,7 +10870,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8499,7 +10894,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8517,7 +10918,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8536,7 +10943,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8555,7 +10968,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8575,7 +10994,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8595,7 +11020,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8616,7 +11047,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8638,7 +11075,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8669,6 +11112,12 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8699,6 +11148,12 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8728,7 +11183,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8748,7 +11209,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8770,6 +11237,12 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8794,7 +11267,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8812,7 +11291,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8830,7 +11315,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8849,7 +11340,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8868,7 +11365,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8888,7 +11391,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8908,7 +11417,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8929,7 +11444,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8951,7 +11472,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8982,6 +11509,12 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9011,6 +11544,12 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9039,7 +11578,13 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9057,7 +11602,13 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9077,6 +11628,12 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9100,7 +11657,13 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9117,7 +11680,13 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9134,7 +11703,13 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9151,7 +11726,13 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9168,7 +11749,13 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9186,7 +11773,13 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9204,7 +11797,13 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9221,7 +11820,13 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9239,7 +11844,13 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9268,6 +11879,12 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9297,6 +11914,12 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9325,7 +11948,13 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9343,7 +11972,13 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9363,6 +11998,12 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9386,7 +12027,13 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9403,7 +12050,13 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9420,7 +12073,13 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9437,7 +12096,13 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9454,7 +12119,13 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9472,7 +12143,13 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9490,7 +12167,13 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9507,7 +12190,13 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9525,7 +12214,13 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9554,6 +12249,12 @@ define amdgpu_kernel void @global_cluster_release_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9584,6 +12285,12 @@ define amdgpu_kernel void @global_cluster_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9613,7 +12320,13 @@ define amdgpu_kernel void @global_cluster_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9633,7 +12346,13 @@ define amdgpu_kernel void @global_cluster_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9655,6 +12374,12 @@ define amdgpu_kernel void @global_cluster_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9679,7 +12404,13 @@ define amdgpu_kernel void @global_cluster_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9697,7 +12428,13 @@ define amdgpu_kernel void @global_cluster_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9715,7 +12452,13 @@ define amdgpu_kernel void @global_cluster_release_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9734,7 +12477,13 @@ define amdgpu_kernel void @global_cluster_release_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9753,7 +12502,13 @@ define amdgpu_kernel void @global_cluster_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9773,7 +12528,13 @@ define amdgpu_kernel void @global_cluster_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9793,7 +12554,13 @@ define amdgpu_kernel void @global_cluster_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9814,7 +12581,13 @@ define amdgpu_kernel void @global_cluster_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9836,7 +12609,13 @@ define amdgpu_kernel void @global_cluster_release_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9867,6 +12646,12 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9897,6 +12682,12 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9926,7 +12717,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9946,7 +12743,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9968,6 +12771,12 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9992,7 +12801,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10010,7 +12825,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10028,7 +12849,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10047,7 +12874,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10066,7 +12899,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10086,7 +12925,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10106,7 +12951,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10127,7 +12978,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10149,7 +13006,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10180,6 +13043,12 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10210,6 +13079,12 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10239,7 +13114,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10259,7 +13140,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10281,6 +13168,12 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10305,7 +13198,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10323,7 +13222,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10341,7 +13246,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10360,7 +13271,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10379,7 +13296,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10399,7 +13322,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10419,7 +13348,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10440,7 +13375,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10462,7 +13403,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10493,6 +13440,12 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10523,6 +13476,12 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10552,7 +13511,13 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10572,7 +13537,13 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10594,6 +13565,12 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10618,7 +13595,13 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10636,7 +13619,13 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10654,7 +13643,13 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10673,7 +13668,13 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10692,7 +13693,13 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10712,7 +13719,13 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10732,7 +13745,13 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10753,7 +13772,13 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10775,7 +13800,13 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10806,6 +13837,12 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10836,6 +13873,12 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10865,7 +13908,13 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10885,7 +13934,13 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10907,6 +13962,12 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10931,7 +13992,13 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10949,7 +14016,13 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10967,7 +14040,13 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10986,7 +14065,13 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11005,7 +14090,13 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11025,7 +14116,13 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11045,7 +14142,13 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11066,7 +14169,13 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11088,7 +14197,13 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -11119,6 +14234,12 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11149,6 +14270,12 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -11178,7 +14305,13 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11198,7 +14331,13 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11220,6 +14359,12 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11244,7 +14389,13 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11262,7 +14413,13 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11280,7 +14437,13 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11299,7 +14462,13 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11318,7 +14487,13 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11338,7 +14513,13 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11358,7 +14539,13 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11379,7 +14566,13 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11401,7 +14594,13 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -11432,6 +14631,12 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11462,6 +14667,12 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -11491,7 +14702,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11511,7 +14728,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11533,6 +14756,12 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11557,7 +14786,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11575,7 +14810,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11593,7 +14834,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11612,7 +14859,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11631,7 +14884,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11651,7 +14910,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11671,7 +14936,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11692,7 +14963,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11714,7 +14991,13 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -11745,6 +15028,12 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11775,6 +15064,12 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -11804,7 +15099,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11824,7 +15125,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11846,6 +15153,12 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11870,7 +15183,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11888,7 +15207,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11906,7 +15231,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11925,7 +15256,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11944,7 +15281,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11964,7 +15307,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11984,7 +15333,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -12005,7 +15360,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -12027,7 +15388,13 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -12057,6 +15424,9 @@ define amdgpu_kernel void @global_cluster_one_as_unordered_load( ; GFX6-LABEL: global_cluster_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -12084,12 +15454,16 @@ define amdgpu_kernel void @global_cluster_one_as_unordered_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -12098,29 +15472,38 @@ define amdgpu_kernel void @global_cluster_one_as_unordered_load( ; ; GFX10-WGP-LABEL: global_cluster_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_cluster_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_cluster_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12145,101 +15528,131 @@ define amdgpu_kernel void @global_cluster_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_cluster_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_cluster_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_cluster_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_cluster_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_cluster_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -12253,6 +15666,9 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_load( ; GFX6-LABEL: global_cluster_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -12280,12 +15696,16 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -12294,29 +15714,38 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_cluster_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_cluster_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12341,101 +15770,131 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_cluster_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_cluster_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_cluster_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -12449,6 +15908,9 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_load( ; GFX6-LABEL: global_cluster_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -12477,14 +15939,18 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_store_dword v[0:1], v2 @@ -12492,33 +15958,44 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_load( ; ; GFX10-WGP-LABEL: global_cluster_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_cluster_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12543,112 +16020,148 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_cluster_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_cluster_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_cluster_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_cluster_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_cluster_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -12662,6 +16175,9 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_load( ; GFX6-LABEL: global_cluster_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -12691,15 +16207,19 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_store_dword v[0:1], v2 @@ -12707,35 +16227,48 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_cluster_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12761,124 +16294,166 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_cluster_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -12892,6 +16467,9 @@ define amdgpu_kernel void @global_cluster_one_as_unordered_store( ; GFX6-LABEL: global_cluster_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -12913,6 +16491,10 @@ define amdgpu_kernel void @global_cluster_one_as_unordered_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -12923,27 +16505,38 @@ define amdgpu_kernel void @global_cluster_one_as_unordered_store( ; ; GFX10-WGP-LABEL: global_cluster_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_cluster_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_cluster_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12961,93 +16554,129 @@ define amdgpu_kernel void @global_cluster_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_cluster_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_cluster_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_cluster_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_cluster_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_cluster_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -13060,6 +16689,9 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_store( ; GFX6-LABEL: global_cluster_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13081,6 +16713,10 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -13091,27 +16727,38 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_cluster_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_cluster_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13129,93 +16776,129 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_cluster_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_cluster_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_cluster_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -13228,6 +16911,9 @@ define amdgpu_kernel void @global_cluster_one_as_release_store( ; GFX6-LABEL: global_cluster_one_as_release_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13250,6 +16936,10 @@ define amdgpu_kernel void @global_cluster_one_as_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -13261,24 +16951,30 @@ define amdgpu_kernel void @global_cluster_one_as_release_store( ; ; GFX10-WGP-LABEL: global_cluster_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_cluster_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm @@ -13286,6 +16982,9 @@ define amdgpu_kernel void @global_cluster_one_as_release_store( ; SKIP-CACHE-INV-LABEL: global_cluster_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13304,113 +17003,143 @@ define amdgpu_kernel void @global_cluster_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_cluster_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_cluster_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_cluster_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_cluster_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_cluster_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -13423,6 +17152,9 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_store( ; GFX6-LABEL: global_cluster_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13445,6 +17177,10 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -13456,24 +17192,30 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm @@ -13481,6 +17223,9 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_store( ; SKIP-CACHE-INV-LABEL: global_cluster_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13499,113 +17244,143 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_cluster_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -13618,6 +17393,10 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_atomicrmw( ; GFX6-LABEL: global_cluster_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -13637,18 +17416,26 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13658,7 +17445,11 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: global_cluster_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13669,6 +17460,10 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_cluster_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -13685,7 +17480,11 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13695,7 +17494,11 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13705,7 +17508,11 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13715,7 +17522,11 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13725,7 +17536,11 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13735,7 +17550,11 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: global_cluster_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13745,7 +17564,11 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13755,7 +17578,11 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: global_cluster_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13766,7 +17593,11 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_atomicrmw( ; GFX1250-LABEL: global_cluster_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13784,6 +17615,10 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_atomicrmw( ; GFX6-LABEL: global_cluster_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -13805,11 +17640,15 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -13818,7 +17657,11 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: global_cluster_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13831,7 +17674,11 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: global_cluster_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13845,6 +17692,10 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -13862,7 +17713,11 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13874,7 +17729,11 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13886,7 +17745,11 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13898,7 +17761,11 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13910,7 +17777,11 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: global_cluster_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13923,7 +17794,11 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_atomicrmw( ; ; GFX11-CU-LABEL: global_cluster_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13936,7 +17811,11 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: global_cluster_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13948,7 +17827,11 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_atomicrmw( ; ; GFX12-CU-LABEL: global_cluster_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13961,7 +17844,11 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_atomicrmw( ; GFX1250-LABEL: global_cluster_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13981,6 +17868,10 @@ define amdgpu_kernel void @global_cluster_one_as_release_atomicrmw( ; GFX6-LABEL: global_cluster_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -14001,11 +17892,15 @@ define amdgpu_kernel void @global_cluster_one_as_release_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -14013,7 +17908,11 @@ define amdgpu_kernel void @global_cluster_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: global_cluster_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14025,7 +17924,11 @@ define amdgpu_kernel void @global_cluster_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: global_cluster_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14038,6 +17941,10 @@ define amdgpu_kernel void @global_cluster_one_as_release_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_cluster_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -14055,7 +17962,11 @@ define amdgpu_kernel void @global_cluster_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14066,7 +17977,11 @@ define amdgpu_kernel void @global_cluster_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14077,7 +17992,11 @@ define amdgpu_kernel void @global_cluster_one_as_release_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14089,7 +18008,11 @@ define amdgpu_kernel void @global_cluster_one_as_release_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14101,7 +18024,11 @@ define amdgpu_kernel void @global_cluster_one_as_release_atomicrmw( ; ; GFX11-WGP-LABEL: global_cluster_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14113,7 +18040,11 @@ define amdgpu_kernel void @global_cluster_one_as_release_atomicrmw( ; ; GFX11-CU-LABEL: global_cluster_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14125,7 +18056,11 @@ define amdgpu_kernel void @global_cluster_one_as_release_atomicrmw( ; ; GFX12-WGP-LABEL: global_cluster_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14139,7 +18074,11 @@ define amdgpu_kernel void @global_cluster_one_as_release_atomicrmw( ; ; GFX12-CU-LABEL: global_cluster_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14154,7 +18093,11 @@ define amdgpu_kernel void @global_cluster_one_as_release_atomicrmw( ; GFX1250-LABEL: global_cluster_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14174,6 +18117,10 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: global_cluster_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -14196,11 +18143,15 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -14210,7 +18161,11 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: global_cluster_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14225,7 +18180,11 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: global_cluster_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14241,6 +18200,10 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -14259,7 +18222,11 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14272,7 +18239,11 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14285,7 +18256,11 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14299,7 +18274,11 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14313,7 +18292,11 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: global_cluster_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14328,7 +18311,11 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: global_cluster_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14343,7 +18330,11 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: global_cluster_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14359,7 +18350,11 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: global_cluster_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14376,7 +18371,11 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_atomicrmw( ; GFX1250-LABEL: global_cluster_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14398,6 +18397,10 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: global_cluster_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -14420,11 +18423,15 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -14434,7 +18441,11 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14449,7 +18460,11 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14465,6 +18480,10 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_cluster_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -14483,7 +18502,11 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14496,7 +18519,11 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14509,7 +18536,11 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14523,7 +18554,11 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14537,7 +18572,11 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14552,7 +18591,11 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14567,7 +18610,11 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14583,7 +18630,11 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14600,7 +18651,11 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_atomicrmw( ; GFX1250-LABEL: global_cluster_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14622,6 +18677,10 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -14645,6 +18704,10 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14660,7 +18723,11 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14674,7 +18741,11 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14689,6 +18760,10 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -14707,7 +18782,11 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14720,7 +18799,11 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14733,7 +18816,11 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14746,7 +18833,11 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14759,7 +18850,11 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14773,7 +18868,11 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14787,7 +18886,11 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14800,7 +18903,11 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14814,7 +18921,11 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_ret_atomicrmw( ; GFX1250-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14836,6 +18947,10 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -14860,6 +18975,10 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14876,7 +18995,11 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14892,7 +19015,11 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14909,6 +19036,10 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -14928,7 +19059,11 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14942,7 +19077,11 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14956,7 +19095,11 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14971,7 +19114,11 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14986,7 +19133,11 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15002,7 +19153,11 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15018,7 +19173,11 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15035,7 +19194,11 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15053,7 +19216,11 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX1250-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15077,6 +19244,10 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -15101,6 +19272,10 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15117,7 +19292,11 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15133,7 +19312,11 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15150,6 +19333,10 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -15169,7 +19356,11 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15183,7 +19374,11 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15197,7 +19392,11 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15212,7 +19411,11 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15227,7 +19430,11 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15243,7 +19450,11 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15259,7 +19470,11 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15276,7 +19491,11 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15294,7 +19513,11 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX1250-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15319,6 +19542,12 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15343,11 +19572,16 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15357,6 +19591,7 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15367,7 +19602,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15381,7 +19622,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15397,6 +19644,12 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15417,7 +19670,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15431,7 +19690,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15445,7 +19710,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15459,7 +19730,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15473,7 +19750,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15487,7 +19770,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15501,7 +19790,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15515,7 +19810,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15530,7 +19831,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15554,6 +19861,12 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15580,11 +19893,16 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15594,6 +19912,7 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15606,7 +19925,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15623,7 +19948,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15642,6 +19973,12 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15663,7 +20000,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15679,7 +20022,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15695,7 +20044,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15711,7 +20066,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15727,7 +20088,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15744,7 +20111,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15761,7 +20134,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15777,7 +20156,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15794,7 +20179,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15820,6 +20211,12 @@ define amdgpu_kernel void @global_cluster_one_as_release_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15845,11 +20242,16 @@ define amdgpu_kernel void @global_cluster_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15859,6 +20261,7 @@ define amdgpu_kernel void @global_cluster_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15870,7 +20273,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15886,7 +20295,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15904,6 +20319,12 @@ define amdgpu_kernel void @global_cluster_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15925,7 +20346,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15940,7 +20367,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15955,7 +20388,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15971,7 +20410,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15987,7 +20432,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16003,7 +20454,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16019,7 +20476,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16037,7 +20500,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16056,7 +20525,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_monotonic_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16082,6 +20557,12 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16109,11 +20590,16 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16123,6 +20609,7 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16136,7 +20623,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16155,7 +20648,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16176,6 +20675,12 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16198,7 +20703,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16215,7 +20726,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16232,7 +20749,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16250,7 +20773,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16268,7 +20797,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16287,7 +20822,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16306,7 +20847,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16326,7 +20873,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16347,7 +20900,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16375,6 +20934,12 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16402,11 +20967,16 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16416,6 +20986,7 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16429,7 +21000,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16448,7 +21025,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16469,6 +21052,12 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16491,7 +21080,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16508,7 +21103,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16525,7 +21126,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16543,7 +21150,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16561,7 +21174,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16580,7 +21199,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16599,7 +21224,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16619,7 +21250,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16640,7 +21277,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16668,6 +21311,12 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16694,11 +21343,16 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16708,6 +21362,7 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16720,7 +21375,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16737,7 +21398,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16756,6 +21423,12 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16777,7 +21450,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16793,7 +21472,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16809,7 +21494,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16825,7 +21516,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16841,7 +21538,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16858,7 +21561,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16875,7 +21584,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16891,7 +21606,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16908,7 +21629,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16934,6 +21661,12 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16960,11 +21693,16 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16974,6 +21712,7 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16986,7 +21725,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17003,7 +21748,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17022,6 +21773,12 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17043,7 +21800,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17059,7 +21822,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17075,7 +21844,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17091,7 +21866,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17107,7 +21888,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17124,7 +21911,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17141,7 +21934,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17157,7 +21956,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17174,7 +21979,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17200,6 +22011,12 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17227,11 +22044,16 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17241,6 +22063,7 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17254,7 +22077,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17273,7 +22102,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17294,6 +22129,12 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17316,7 +22157,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17333,7 +22180,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17350,7 +22203,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17368,7 +22227,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17386,7 +22251,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17405,7 +22276,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17424,7 +22301,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17444,7 +22327,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17465,7 +22354,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17493,6 +22388,12 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17520,11 +22421,16 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17534,6 +22440,7 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17547,7 +22454,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17566,7 +22479,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17587,6 +22506,12 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17609,7 +22534,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17626,7 +22557,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17643,7 +22580,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17661,7 +22604,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17679,7 +22628,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17698,7 +22653,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17717,7 +22678,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17737,7 +22704,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17758,7 +22731,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17786,6 +22765,12 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17813,11 +22798,16 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17827,6 +22817,7 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17840,7 +22831,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17859,7 +22856,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17880,6 +22883,12 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17902,7 +22911,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17919,7 +22934,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17936,7 +22957,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17954,7 +22981,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17972,7 +23005,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17991,7 +23030,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18010,7 +23055,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18030,7 +23081,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18051,7 +23108,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18079,6 +23142,12 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -18106,11 +23175,16 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -18120,6 +23194,7 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -18133,7 +23208,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18152,7 +23233,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18173,6 +23260,12 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -18195,7 +23288,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18212,7 +23311,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18229,7 +23334,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18247,7 +23358,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18265,7 +23382,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18284,7 +23407,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18303,7 +23432,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18323,7 +23458,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18344,7 +23485,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18372,6 +23519,12 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -18399,11 +23552,16 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -18413,6 +23571,7 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -18426,7 +23585,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18445,7 +23610,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18466,6 +23637,12 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -18488,7 +23665,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18505,7 +23688,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18522,7 +23711,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18540,7 +23735,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18558,7 +23759,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18577,7 +23784,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18596,7 +23809,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18616,7 +23835,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18637,7 +23862,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18665,6 +23896,12 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -18692,11 +23929,16 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -18706,6 +23948,7 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -18719,7 +23962,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18738,7 +23987,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18759,6 +24014,12 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -18781,7 +24042,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18798,7 +24065,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18815,7 +24088,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18833,7 +24112,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18851,7 +24136,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18870,7 +24161,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18889,7 +24186,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18909,7 +24212,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18930,7 +24239,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18958,6 +24273,12 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -18985,11 +24306,16 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -18999,6 +24325,7 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -19012,7 +24339,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19031,7 +24364,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19052,6 +24391,12 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -19074,7 +24419,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19091,7 +24442,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19108,7 +24465,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19126,7 +24489,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19144,7 +24513,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19163,7 +24538,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19182,7 +24563,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19202,7 +24589,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19223,7 +24616,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19251,6 +24650,12 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -19278,11 +24683,16 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -19292,6 +24702,7 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -19305,7 +24716,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19324,7 +24741,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19345,6 +24768,12 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -19367,7 +24796,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19384,7 +24819,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19401,7 +24842,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19419,7 +24866,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19437,7 +24890,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19456,7 +24915,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19475,7 +24940,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19495,7 +24966,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19516,7 +24993,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19544,6 +25027,12 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_ret_cmpxchg ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -19572,6 +25061,12 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_ret_cmpxchg ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19599,7 +25094,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19615,7 +25116,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX10-CU-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19633,6 +25140,12 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_ret_cmpxchg ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -19656,7 +25169,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19672,7 +25191,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19688,7 +25213,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19704,7 +25235,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19720,7 +25257,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19736,7 +25279,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX11-CU-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19752,7 +25301,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19768,7 +25323,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX12-CU-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19785,7 +25346,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_ret_cmpxchg ; GFX1250-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19813,6 +25380,12 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -19842,6 +25415,12 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19870,7 +25449,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19888,7 +25473,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19908,6 +25499,12 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -19931,7 +25528,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19948,7 +25551,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19965,7 +25574,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19982,7 +25597,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19999,7 +25620,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20017,7 +25644,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20035,7 +25668,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20052,7 +25691,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20070,7 +25715,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20099,6 +25750,12 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -20129,6 +25786,12 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20158,7 +25821,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20178,7 +25847,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20200,6 +25875,12 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -20224,7 +25905,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20242,7 +25929,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20260,7 +25953,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20279,7 +25978,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20298,7 +26003,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20318,7 +26029,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20338,7 +26055,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20359,7 +26082,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20381,7 +26110,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20412,6 +26147,12 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -20442,6 +26183,12 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20471,7 +26218,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20491,7 +26244,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20513,6 +26272,12 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -20537,7 +26302,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20555,7 +26326,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20573,7 +26350,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20592,7 +26375,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20611,7 +26400,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20631,7 +26426,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20651,7 +26452,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20672,7 +26479,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20694,7 +26507,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20725,6 +26544,12 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -20754,6 +26579,12 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20782,7 +26613,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20800,7 +26637,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20820,6 +26663,12 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -20843,7 +26692,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20860,7 +26715,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20877,7 +26738,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20894,7 +26761,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20911,7 +26784,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20929,7 +26808,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20947,7 +26832,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20964,7 +26855,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20982,7 +26879,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21011,6 +26914,12 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -21040,6 +26949,12 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21068,7 +26983,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21086,7 +27007,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21106,6 +27033,12 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -21129,7 +27062,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21146,7 +27085,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21163,7 +27108,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21180,7 +27131,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21197,7 +27154,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21215,7 +27178,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21233,7 +27202,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21250,7 +27225,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21268,7 +27249,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21297,6 +27284,12 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -21327,6 +27320,12 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21356,7 +27355,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21376,7 +27381,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21398,6 +27409,12 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -21422,7 +27439,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21440,7 +27463,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21458,7 +27487,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21477,7 +27512,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21496,7 +27537,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21516,7 +27563,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21536,7 +27589,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21557,7 +27616,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21579,7 +27644,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21610,6 +27681,12 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -21640,6 +27717,12 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21669,7 +27752,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21689,7 +27778,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21711,6 +27806,12 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -21735,7 +27836,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21753,7 +27860,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21771,7 +27884,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21790,7 +27909,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21809,7 +27934,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21829,7 +27960,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21849,7 +27986,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21870,7 +28013,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21892,7 +28041,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21923,6 +28078,12 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -21953,6 +28114,12 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21982,7 +28149,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22002,7 +28175,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22024,6 +28203,12 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -22048,7 +28233,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22066,7 +28257,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22084,7 +28281,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22103,7 +28306,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22122,7 +28331,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22142,7 +28357,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22162,7 +28383,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22183,7 +28410,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22205,7 +28438,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -22236,6 +28475,12 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -22266,6 +28511,12 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -22295,7 +28546,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22315,7 +28572,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22337,6 +28600,12 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -22361,7 +28630,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22379,7 +28654,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22397,7 +28678,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22416,7 +28703,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22435,7 +28728,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22455,7 +28754,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22475,7 +28780,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22496,7 +28807,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22518,7 +28835,13 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -22549,6 +28872,12 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -22579,6 +28908,12 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -22608,7 +28943,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22628,7 +28969,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22650,6 +28997,12 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -22674,7 +29027,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22692,7 +29051,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22710,7 +29075,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22729,7 +29100,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22748,7 +29125,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22768,7 +29151,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22788,7 +29177,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22809,7 +29204,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22831,7 +29232,13 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -22862,6 +29269,12 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -22892,6 +29305,12 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -22921,7 +29340,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22941,7 +29366,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22963,6 +29394,12 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -22987,7 +29424,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23005,7 +29448,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23023,7 +29472,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -23042,7 +29497,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -23061,7 +29522,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23081,7 +29548,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23101,7 +29574,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23122,7 +29601,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23144,7 +29629,13 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -23175,6 +29666,12 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -23205,6 +29702,12 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -23234,7 +29737,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23254,7 +29763,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23276,6 +29791,12 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -23300,7 +29821,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23318,7 +29845,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23336,7 +29869,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -23355,7 +29894,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -23374,7 +29919,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23394,7 +29945,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23414,7 +29971,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23435,7 +29998,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23457,7 +30026,13 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -23488,6 +30063,12 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -23518,6 +30099,12 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -23547,7 +30134,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23567,7 +30160,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23589,6 +30188,12 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -23613,7 +30218,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23631,7 +30242,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23649,7 +30266,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -23668,7 +30291,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -23687,7 +30316,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23707,7 +30342,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23727,7 +30368,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23748,7 +30395,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23770,7 +30423,13 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll index 909ab710562b..79ac22943abc 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll @@ -6,10 +6,13 @@ define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX12-LABEL: global_last_use_load_0: ; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s2 @@ -19,10 +22,13 @@ define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addr ; GFX1250-LABEL: global_last_use_load_0: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 @@ -37,17 +43,34 @@ entry: define amdgpu_kernel void @global_last_use_load_1(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX12-LABEL: global_last_use_load_1: ; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX12-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_mov_b32 s4, 0x3ff -; GFX12-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX12-NEXT: s_mov_b32 s4, 2 -; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: v_lshlrev_b32_e64 v1, s4, v1 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v1, v1, s[2:3] th:TH_LOAD_LU +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-NEXT: v_and_b32_e64 v1, v1, s2 +; GFX12-NEXT: v_ashrrev_i32_e64 v3, 31, v1 +; GFX12-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_mov_b32 s2, 2 +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-NEXT: v_lshlrev_b64_e64 v[2:3], s2, v[1:2] +; GFX12-NEXT: s_mov_b32 s3, s4 +; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_mov_b32 s2, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-NEXT: v_add_co_u32 v1, s3, s3, v1 +; GFX12-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s2, s2, v2, s3 +; GFX12-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: global_load_b32 v1, v[1:2], off th:TH_LOAD_LU ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm @@ -56,15 +79,19 @@ define amdgpu_kernel void @global_last_use_load_1(ptr addrspace(1) %in, ptr addr ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b32 s4, 0x3ff ; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_LU ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm entry: @@ -78,27 +105,35 @@ entry: define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX12-LABEL: global_last_use_and_volatile_load: ; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm ; ; GFX1250-LABEL: global_last_use_and_volatile_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm entry: @@ -110,17 +145,34 @@ entry: define amdgpu_kernel void @global_last_use_and_nontemporal_load(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX12-LABEL: global_last_use_and_nontemporal_load: ; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX12-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_mov_b32 s4, 0x3ff -; GFX12-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX12-NEXT: s_mov_b32 s4, 2 -; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: v_lshlrev_b32_e64 v1, s4, v1 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v1, v1, s[2:3] th:TH_LOAD_LU +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-NEXT: v_and_b32_e64 v1, v1, s2 +; GFX12-NEXT: v_ashrrev_i32_e64 v3, 31, v1 +; GFX12-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_mov_b32 s2, 2 +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-NEXT: v_lshlrev_b64_e64 v[2:3], s2, v[1:2] +; GFX12-NEXT: s_mov_b32 s3, s4 +; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_mov_b32 s2, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-NEXT: v_add_co_u32 v1, s3, s3, v1 +; GFX12-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s2, s2, v2, s3 +; GFX12-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: global_load_b32 v1, v[1:2], off th:TH_LOAD_LU ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm @@ -129,15 +181,19 @@ define amdgpu_kernel void @global_last_use_and_nontemporal_load(ptr addrspace(1) ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b32 s4, 0x3ff ; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_LU ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index d25227b1fa86..4285cf21af75 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -18,6 +18,9 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; GFX6-LABEL: global_nontemporal_load_0: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -41,6 +44,10 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 @@ -53,10 +60,13 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX10-WGP-LABEL: global_nontemporal_load_0: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -65,10 +75,13 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX10-CU-LABEL: global_nontemporal_load_0: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -78,6 +91,9 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; SKIP-CACHE-INV-LABEL: global_nontemporal_load_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -97,10 +113,13 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_load_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -109,10 +128,13 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX90A-TGSPLIT-LABEL: global_nontemporal_load_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -121,10 +143,13 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX942-NOTTGSPLIT-LABEL: global_nontemporal_load_0: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -133,10 +158,13 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX942-TGSPLIT-LABEL: global_nontemporal_load_0: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -145,10 +173,13 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX11-WGP-LABEL: global_nontemporal_load_0: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -157,10 +188,13 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX11-CU-LABEL: global_nontemporal_load_0: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -169,10 +203,13 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX12-WGP-LABEL: global_nontemporal_load_0: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -181,10 +218,13 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX12-CU-LABEL: global_nontemporal_load_0: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -194,10 +234,13 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; GFX1250-LABEL: global_nontemporal_load_0: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 @@ -214,6 +257,9 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; GFX6-LABEL: global_nontemporal_load_1: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -230,11 +276,11 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; GFX6-NEXT: s_mov_b32 s13, s10 ; GFX6-NEXT: ; kill: def $sgpr8_sgpr9 killed $sgpr8_sgpr9 def $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX6-NEXT: s_mov_b64 s[10:11], s[12:13] -; GFX6-NEXT: s_mov_b32 s12, 2 -; GFX6-NEXT: v_lshlrev_b32_e64 v0, s12, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_ashrrev_i32_e64 v2, 31, v0 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_mov_b32 s12, 2 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s12 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 glc slc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -246,14 +292,17 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 -; GFX7-NEXT: s_mov_b32 s6, 2 -; GFX7-NEXT: v_lshlrev_b32_e64 v1, s6, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, 0 -; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX7-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 2 +; GFX7-NEXT: v_lshl_b64 v[1:2], v[0:1], s6 ; GFX7-NEXT: s_mov_b32 s6, s8 ; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_mov_b32 s8, s9 @@ -272,28 +321,60 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; ; GFX10-WGP-LABEL: global_nontemporal_load_1: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_mov_b32 s8, 2 -; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v1, s8, v1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: global_load_dword v1, v1, s[6:7] slc +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_nop 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: v_ashrrev_i32_e64 v3, 31, v1 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, 2 +; GFX10-WGP-NEXT: v_lshlrev_b64 v[2:3], s6, v[1:2] +; GFX10-WGP-NEXT: s_mov_b32 s7, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-WGP-NEXT: s_mov_b32 s6, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: v_add_co_u32 v1, s7, s7, v1 +; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v3, s6, s6, v2, s7 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: global_load_dword v1, v[1:2], off slc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_nontemporal_load_1: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_mov_b32 s8, 2 -; GFX10-CU-NEXT: v_lshlrev_b32_e64 v1, s8, v1 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_load_dword v1, v1, s[6:7] slc +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_nop 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: v_ashrrev_i32_e64 v3, 31, v1 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, 2 +; GFX10-CU-NEXT: v_lshlrev_b64 v[2:3], s6, v[1:2] +; GFX10-CU-NEXT: s_mov_b32 s7, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-CU-NEXT: s_mov_b32 s6, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: v_add_co_u32 v1, s7, s7, v1 +; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v3, s6, s6, v2, s7 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: global_load_dword v1, v[1:2], off slc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm @@ -301,6 +382,9 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; SKIP-CACHE-INV-LABEL: global_nontemporal_load_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -317,11 +401,11 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s9, s6 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], s[8:9] -; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 2 -; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v0, s8, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, 0 +; SKIP-CACHE-INV-NEXT: v_ashrrev_i32_e64 v2, 31, v0 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 2 +; SKIP-CACHE-INV-NEXT: v_lshl_b64 v[0:1], v[0:1], s8 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc slc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -329,130 +413,246 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; ; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_load_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s8, 0x3ff -; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s8 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s8, 2 -; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v1, s8, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v1, s[6:7] glc slc +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff +; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v2, v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_ashrrev_i32_e64 v1, 31, v2 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 2 +; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b64 v[2:3], s6, v[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, s8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s8, s9 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_add_co_u32_e64 v2, s[6:7], s6, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NOTTGSPLIT-NEXT: v_addc_co_u32_e64 v1, s[6:7], v1, v3, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v[2:3], off glc slc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_nontemporal_load_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s8, 0x3ff -; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s8 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s8, 2 -; GFX90A-TGSPLIT-NEXT: v_lshlrev_b32_e64 v1, s8, v1 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v1, s[6:7] glc slc +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_nop 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff +; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v2, v1, s6 +; GFX90A-TGSPLIT-NEXT: v_ashrrev_i32_e64 v1, 31, v2 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 2 +; GFX90A-TGSPLIT-NEXT: v_lshlrev_b64 v[2:3], s6, v[2:3] +; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, s8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-TGSPLIT-NEXT: s_mov_b32 s8, s9 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_add_co_u32_e64 v2, s[6:7], s6, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-TGSPLIT-NEXT: v_addc_co_u32_e64 v1, s[6:7], v1, v3, s[6:7] +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v[2:3], off glc slc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_nontemporal_load_1: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s4, 0x3ff -; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s4, 2 -; GFX942-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v1, s4, v1 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v1, s[2:3] nt +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff +; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v2, v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_ashrrev_i32_e64 v1, 31, v2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u64 v[2:3], v[2:3], s2, v[4:5] +; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v[2:3], off nt ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_nontemporal_load_1: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s4, 0x3ff -; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s4, 2 -; GFX942-TGSPLIT-NEXT: v_lshlrev_b32_e64 v1, s4, v1 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-TGSPLIT-NEXT: global_load_dword v1, v1, s[2:3] nt +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff +; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v2, v1, s2 +; GFX942-TGSPLIT-NEXT: v_ashrrev_i32_e64 v1, 31, v2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 2 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-TGSPLIT-NEXT: v_lshl_add_u64 v[2:3], v[2:3], s2, v[4:5] +; GFX942-TGSPLIT-NEXT: global_load_dword v1, v[2:3], off nt ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_nontemporal_load_1: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_mov_b32 s4, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX11-WGP-NEXT: s_mov_b32 s4, 2 -; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v1, s4, v1 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: global_load_b32 v1, v1, s[2:3] slc dlc +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2 +; GFX11-WGP-NEXT: v_ashrrev_i32_e64 v3, 31, v1 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_mov_b32 s2, 2 +; GFX11-WGP-NEXT: v_lshlrev_b64 v[2:3], s2, v[1:2] +; GFX11-WGP-NEXT: s_mov_b32 s3, s4 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-WGP-NEXT: s_mov_b32 s2, s5 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: v_add_co_u32 v1, s3, s3, v1 +; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v3, s2, s2, v2, s3 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: global_load_b32 v1, v[1:2], off slc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_nontemporal_load_1: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_mov_b32 s4, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX11-CU-NEXT: s_mov_b32 s4, 2 -; GFX11-CU-NEXT: v_lshlrev_b32_e64 v1, s4, v1 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: global_load_b32 v1, v1, s[2:3] slc dlc +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2 +; GFX11-CU-NEXT: v_ashrrev_i32_e64 v3, 31, v1 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_mov_b32 s2, 2 +; GFX11-CU-NEXT: v_lshlrev_b64 v[2:3], s2, v[1:2] +; GFX11-CU-NEXT: s_mov_b32 s3, s4 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-CU-NEXT: s_mov_b32 s2, s5 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: v_add_co_u32 v1, s3, s3, v1 +; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v3, s2, s2, v2, s3 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: global_load_b32 v1, v[1:2], off slc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_mov_b32 s4, 0x3ff -; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX12-WGP-NEXT: s_mov_b32 s4, 2 -; GFX12-WGP-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s4, v1 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v1, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2 +; GFX12-WGP-NEXT: v_ashrrev_i32_e64 v3, 31, v1 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_mov_b32 s2, 2 +; GFX12-WGP-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-WGP-NEXT: v_lshlrev_b64_e64 v[2:3], s2, v[1:2] +; GFX12-WGP-NEXT: s_mov_b32 s3, s4 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-WGP-NEXT: s_mov_b32 s2, s5 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-WGP-NEXT: v_add_co_u32 v1, s3, s3, v1 +; GFX12-WGP-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v3, s2, s2, v2, s3 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_load_b32 v1, v[1:2], off th:TH_LOAD_NT ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_mov_b32 s4, 0x3ff -; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX12-CU-NEXT: s_mov_b32 s4, 2 -; GFX12-CU-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s4, v1 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v1, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2 +; GFX12-CU-NEXT: v_ashrrev_i32_e64 v3, 31, v1 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_mov_b32 s2, 2 +; GFX12-CU-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-CU-NEXT: v_lshlrev_b64_e64 v[2:3], s2, v[1:2] +; GFX12-CU-NEXT: s_mov_b32 s3, s4 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-CU-NEXT: s_mov_b32 s2, s5 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-CU-NEXT: v_add_co_u32 v1, s3, s3, v1 +; GFX12-CU-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v3, s2, s2, v2, s3 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_load_b32 v1, v[1:2], off th:TH_LOAD_NT ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm @@ -461,15 +661,19 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b32 s4, 0x3ff ; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_NT ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -485,6 +689,9 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; GFX6-LABEL: global_nontemporal_store_0: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -508,6 +715,10 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 @@ -520,10 +731,13 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX10-WGP-LABEL: global_nontemporal_store_0: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -532,10 +746,13 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX10-CU-LABEL: global_nontemporal_store_0: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -545,6 +762,9 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; SKIP-CACHE-INV-LABEL: global_nontemporal_store_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -564,10 +784,13 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_store_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -576,10 +799,13 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX90A-TGSPLIT-LABEL: global_nontemporal_store_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -588,10 +814,13 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX942-NOTTGSPLIT-LABEL: global_nontemporal_store_0: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -600,10 +829,13 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX942-TGSPLIT-LABEL: global_nontemporal_store_0: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -612,10 +844,13 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX11-WGP-LABEL: global_nontemporal_store_0: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -624,10 +859,13 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX11-CU-LABEL: global_nontemporal_store_0: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -636,10 +874,13 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX12-WGP-LABEL: global_nontemporal_store_0: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -648,10 +889,13 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX12-CU-LABEL: global_nontemporal_store_0: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -661,10 +905,13 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; GFX1250-LABEL: global_nontemporal_store_0: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 @@ -680,9 +927,12 @@ entry: define amdgpu_kernel void @global_nontemporal_store_1( ; GFX6-LABEL: global_nontemporal_store_1: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX6-NEXT: s_mov_b32 s6, 0x100f000 ; GFX6-NEXT: s_mov_b32 s10, 0 @@ -690,11 +940,11 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; GFX6-NEXT: s_mov_b32 s11, s6 ; GFX6-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX6-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_mov_b32 s9, 2 -; GFX6-NEXT: v_lshlrev_b32_e64 v1, s9, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, 0 -; GFX6-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_lshl_b64 v[1:2], v[0:1], s9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 glc slc @@ -706,14 +956,18 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX7-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_mov_b32 s5, 2 -; GFX7-NEXT: v_lshlrev_b32_e64 v1, s5, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, 0 -; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_lshl_b64 v[1:2], v[0:1], s5 ; GFX7-NEXT: s_mov_b32 s6, s8 ; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_mov_b32 s5, s9 @@ -730,35 +984,70 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; ; GFX10-WGP-LABEL: global_nontemporal_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_mov_b32 s7, 2 -; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v0, s7, v0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_nop 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] glc slc +; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-WGP-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-WGP-NEXT: s_mov_b32 s5, 2 +; GFX10-WGP-NEXT: v_lshlrev_b64 v[1:2], s5, v[0:1] +; GFX10-WGP-NEXT: s_mov_b32 s6, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-WGP-NEXT: v_add_co_u32 v0, s6, s6, v0 +; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v2, s5, s5, v1, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off glc slc ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_nontemporal_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX10-CU-NEXT: s_mov_b32 s7, 2 -; GFX10-CU-NEXT: v_lshlrev_b32_e64 v0, s7, v0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_nop 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] glc slc +; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-CU-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-CU-NEXT: s_mov_b32 s5, 2 +; GFX10-CU-NEXT: v_lshlrev_b64 v[1:2], s5, v[0:1] +; GFX10-CU-NEXT: s_mov_b32 s6, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-CU-NEXT: v_add_co_u32 v0, s6, s6, v0 +; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v2, s5, s5, v1, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off glc slc ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_nontemporal_store_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0 @@ -766,11 +1055,11 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[6:7] +; SKIP-CACHE-INV-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 2 -; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v1, s5, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 +; SKIP-CACHE-INV-NEXT: v_lshl_b64 v[1:2], v[0:1], s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 glc slc @@ -778,132 +1067,248 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; ; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_store_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s7, 0x3ff -; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s7 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s7, 2 -; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s7, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] glc slc +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 0x3ff +; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 2 +; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, s8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, s9 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NOTTGSPLIT-NEXT: v_add_co_u32_e64 v0, s[6:7], s6, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_addc_co_u32_e64 v2, s[6:7], v1, v2, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v[0:1], v2, off glc slc ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_nontemporal_store_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s7, 0x3ff -; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s7 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s7, 2 -; GFX90A-TGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s7, v0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_nop 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] glc slc +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 0x3ff +; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5 +; GFX90A-TGSPLIT-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 2 +; GFX90A-TGSPLIT-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, s8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, s9 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-TGSPLIT-NEXT: v_add_co_u32_e64 v0, s[6:7], s6, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_addc_co_u32_e64 v2, s[6:7], v1, v2, s[6:7] +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: global_store_dword v[0:1], v2, off glc slc ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_nontemporal_store_1: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 0x3ff -; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s3 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 2 -; GFX942-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s3, v0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] nt +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 0x3ff +; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1 +; GFX942-NOTTGSPLIT-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u64 v[0:1], v[0:1], s1, v[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v[0:1], v2, off nt ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_nontemporal_store_1: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 0x3ff -; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s3 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 2 -; GFX942-TGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s3, v0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] nt +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 0x3ff +; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1 +; GFX942-TGSPLIT-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 2 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-TGSPLIT-NEXT: v_lshl_add_u64 v[0:1], v[0:1], s1, v[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: global_store_dword v[0:1], v2, off nt ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_nontemporal_store_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s3 -; GFX11-WGP-NEXT: s_mov_b32 s3, 2 -; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v0, s3, v0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[4:5], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] glc slc dlc +; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1 +; GFX11-WGP-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-WGP-NEXT: s_mov_b32 s1, 2 +; GFX11-WGP-NEXT: v_lshlrev_b64 v[1:2], s1, v[0:1] +; GFX11-WGP-NEXT: s_mov_b32 s2, s4 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-WGP-NEXT: s_mov_b32 s1, s5 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-WGP-NEXT: v_add_co_u32 v0, s2, s2, v0 +; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v2, s1, s1, v1, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: global_store_b32 v[0:1], v2, off glc slc dlc ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_nontemporal_store_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s3 -; GFX11-CU-NEXT: s_mov_b32 s3, 2 -; GFX11-CU-NEXT: v_lshlrev_b32_e64 v0, s3, v0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[4:5], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] glc slc dlc +; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1 +; GFX11-CU-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-CU-NEXT: s_mov_b32 s1, 2 +; GFX11-CU-NEXT: v_lshlrev_b64 v[1:2], s1, v[0:1] +; GFX11-CU-NEXT: s_mov_b32 s2, s4 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-CU-NEXT: s_mov_b32 s1, s5 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-CU-NEXT: v_add_co_u32 v0, s2, s2, v0 +; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v2, s1, s1, v1, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: global_store_b32 v[0:1], v2, off glc slc dlc ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_nontemporal_store_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff -; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s3 -; GFX12-WGP-NEXT: s_mov_b32 s3, 2 -; GFX12-WGP-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v0, s3, v0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT +; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff +; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1 +; GFX12-WGP-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-WGP-NEXT: s_mov_b32 s1, 2 +; GFX12-WGP-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-WGP-NEXT: v_lshlrev_b64_e64 v[1:2], s1, v[0:1] +; GFX12-WGP-NEXT: s_mov_b32 s2, s4 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-WGP-NEXT: s_mov_b32 s1, s5 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-WGP-NEXT: v_add_co_u32 v0, s2, s2, v0 +; GFX12-WGP-NEXT: s_wait_alu depctr_sa_sdst(0) depctr_va_sdst(0) +; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v2, s1, s1, v1, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_store_b32 v[0:1], v2, off th:TH_STORE_NT ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_nontemporal_store_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff -; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s3 -; GFX12-CU-NEXT: s_mov_b32 s3, 2 -; GFX12-CU-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-CU-NEXT: v_lshlrev_b32_e64 v0, s3, v0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[4:5], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT +; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff +; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1 +; GFX12-CU-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-CU-NEXT: s_mov_b32 s1, 2 +; GFX12-CU-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-CU-NEXT: v_lshlrev_b64_e64 v[1:2], s1, v[0:1] +; GFX12-CU-NEXT: s_mov_b32 s2, s4 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-CU-NEXT: s_mov_b32 s1, s5 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-CU-NEXT: v_add_co_u32 v0, s2, s2, v0 +; GFX12-CU-NEXT: s_wait_alu depctr_sa_sdst(0) depctr_va_sdst(0) +; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v2, s1, s1, v1, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_store_b32 v[0:1], v2, off th:TH_STORE_NT ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_nontemporal_store_1: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv +; GFX1250-NEXT: s_nop 0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b32 s3, 0x3ff @@ -925,6 +1330,9 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; GFX6-LABEL: global_nontemporal_volatile_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -952,13 +1360,16 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_store_dword v[0:1], v2 @@ -966,29 +1377,38 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; ; GFX10-WGP-LABEL: global_nontemporal_volatile_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_nontemporal_volatile_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_nontemporal_volatile_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1013,106 +1433,136 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_volatile_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_nontemporal_volatile_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_nontemporal_volatile_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_nontemporal_volatile_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_nontemporal_volatile_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_nontemporal_volatile_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_nontemporal_volatile_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_nontemporal_volatile_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_nontemporal_volatile_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll index 539adc3ade31..1e5b1c198cda 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -18,6 +18,9 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; GFX6-LABEL: global_singlethread_unordered_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -45,12 +48,16 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -59,29 +66,38 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; ; GFX10-WGP-LABEL: global_singlethread_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -106,101 +122,131 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_singlethread_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_singlethread_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -214,6 +260,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; GFX6-LABEL: global_singlethread_monotonic_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -241,12 +290,16 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -255,29 +308,38 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -302,101 +364,131 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_singlethread_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_singlethread_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -410,6 +502,9 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; GFX6-LABEL: global_singlethread_acquire_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -437,12 +532,16 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -451,29 +550,38 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; ; GFX10-WGP-LABEL: global_singlethread_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -498,101 +606,131 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_singlethread_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_singlethread_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -606,6 +744,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; GFX6-LABEL: global_singlethread_seq_cst_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -633,12 +774,16 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -647,29 +792,38 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -694,101 +848,131 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_singlethread_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_singlethread_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -802,6 +986,9 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; GFX6-LABEL: global_singlethread_unordered_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -823,6 +1010,10 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -833,27 +1024,38 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; ; GFX10-WGP-LABEL: global_singlethread_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -871,93 +1073,129 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_singlethread_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_singlethread_unordered_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -970,6 +1208,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; GFX6-LABEL: global_singlethread_monotonic_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -991,6 +1232,10 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1001,27 +1246,38 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1039,93 +1295,129 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_singlethread_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_singlethread_monotonic_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1138,6 +1430,9 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX6-LABEL: global_singlethread_release_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1159,6 +1454,10 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1169,27 +1468,38 @@ define amdgpu_kernel void @global_singlethread_release_store( ; ; GFX10-WGP-LABEL: global_singlethread_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1207,93 +1517,129 @@ define amdgpu_kernel void @global_singlethread_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_release_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_release_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_release_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_singlethread_release_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_singlethread_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1306,6 +1652,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX6-LABEL: global_singlethread_seq_cst_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1327,6 +1676,10 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1337,27 +1690,38 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1375,93 +1739,129 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_singlethread_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_singlethread_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1474,6 +1874,10 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; GFX6-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -1493,18 +1897,26 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1514,7 +1926,11 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1525,6 +1941,10 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -1541,7 +1961,11 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1551,7 +1975,11 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1561,7 +1989,11 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1571,7 +2003,11 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1581,7 +2017,11 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1591,7 +2031,11 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1601,7 +2045,11 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1611,7 +2059,11 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1622,7 +2074,11 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; GFX1250-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1640,6 +2096,10 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX6-LABEL: global_singlethread_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -1659,18 +2119,26 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1680,7 +2148,11 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; ; GFX10-CU-LABEL: global_singlethread_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1691,6 +2163,10 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -1707,7 +2183,11 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1717,7 +2197,11 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1727,7 +2211,11 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1737,7 +2225,11 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1747,7 +2239,11 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: global_singlethread_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1757,7 +2253,11 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; ; GFX11-CU-LABEL: global_singlethread_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1767,7 +2267,11 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: global_singlethread_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1777,7 +2281,11 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; ; GFX12-CU-LABEL: global_singlethread_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1788,7 +2296,11 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX1250-LABEL: global_singlethread_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1806,6 +2318,10 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX6-LABEL: global_singlethread_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -1825,18 +2341,26 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1846,7 +2370,11 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; ; GFX10-CU-LABEL: global_singlethread_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1857,6 +2385,10 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_singlethread_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -1873,7 +2405,11 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1883,7 +2419,11 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1893,7 +2433,11 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1903,7 +2447,11 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1913,7 +2461,11 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; ; GFX11-WGP-LABEL: global_singlethread_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1923,7 +2475,11 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; ; GFX11-CU-LABEL: global_singlethread_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1933,7 +2489,11 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; ; GFX12-WGP-LABEL: global_singlethread_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1943,7 +2503,11 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; ; GFX12-CU-LABEL: global_singlethread_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1954,7 +2518,11 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX1250-LABEL: global_singlethread_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1972,6 +2540,10 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX6-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -1991,18 +2563,26 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2012,7 +2592,11 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2023,6 +2607,10 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2039,7 +2627,11 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2049,7 +2641,11 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2059,7 +2655,11 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2069,7 +2669,11 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2079,7 +2683,11 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2089,7 +2697,11 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2099,7 +2711,11 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2109,7 +2725,11 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2120,7 +2740,11 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX1250-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2138,6 +2762,10 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX6-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2157,18 +2785,26 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2178,7 +2814,11 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2189,6 +2829,10 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2205,7 +2849,11 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2215,7 +2863,11 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2225,7 +2877,11 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2235,7 +2891,11 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2245,7 +2905,11 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2255,7 +2919,11 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2265,7 +2933,11 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2275,7 +2947,11 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2286,7 +2962,11 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX1250-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2304,6 +2984,10 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; GFX6-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2326,6 +3010,10 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2340,7 +3028,11 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2352,7 +3044,11 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2365,6 +3061,10 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2383,7 +3083,11 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2395,7 +3099,11 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2407,7 +3115,11 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2419,7 +3131,11 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2431,7 +3147,11 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2443,7 +3163,11 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2455,7 +3179,11 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2467,7 +3195,11 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2480,7 +3212,11 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; GFX1250-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2501,6 +3237,10 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2523,6 +3263,10 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2537,7 +3281,11 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2549,7 +3297,11 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2562,6 +3314,10 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2580,7 +3336,11 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2592,7 +3352,11 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2604,7 +3368,11 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2616,7 +3384,11 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2628,7 +3400,11 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2640,7 +3416,11 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2652,7 +3432,11 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2664,7 +3448,11 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2677,7 +3465,11 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX1250-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2698,6 +3490,10 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2720,6 +3516,10 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2734,7 +3534,11 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2746,7 +3550,11 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2759,6 +3567,10 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2777,7 +3589,11 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2789,7 +3605,11 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2801,7 +3621,11 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2813,7 +3637,11 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2825,7 +3653,11 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2837,7 +3669,11 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2849,7 +3685,11 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2861,7 +3701,11 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2874,7 +3718,11 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX1250-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2896,6 +3744,12 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2920,11 +3774,16 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -2934,6 +3793,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -2944,7 +3804,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2958,7 +3824,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2974,6 +3846,12 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2994,7 +3872,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3008,7 +3892,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3022,7 +3912,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3036,7 +3932,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3050,7 +3952,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3064,7 +3972,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3078,7 +3992,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3092,7 +4012,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3107,7 +4033,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3131,6 +4063,12 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3155,11 +4093,16 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3169,6 +4112,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3179,7 +4123,13 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3193,7 +4143,13 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3209,6 +4165,12 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3229,7 +4191,13 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3243,7 +4211,13 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3257,7 +4231,13 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3271,7 +4251,13 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3285,7 +4271,13 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3299,7 +4291,13 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3313,7 +4311,13 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3327,7 +4331,13 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3342,7 +4352,13 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3366,6 +4382,12 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3390,11 +4412,16 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3404,6 +4431,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3414,7 +4442,13 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3428,7 +4462,13 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3444,6 +4484,12 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3464,7 +4510,13 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3478,7 +4530,13 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3492,7 +4550,13 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3506,7 +4570,13 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3520,7 +4590,13 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3534,7 +4610,13 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3548,7 +4630,13 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3562,7 +4650,13 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3577,7 +4671,13 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX1250-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3601,6 +4701,12 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3625,11 +4731,16 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3639,6 +4750,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3649,7 +4761,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3663,7 +4781,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3679,6 +4803,12 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3699,7 +4829,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3713,7 +4849,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3727,7 +4869,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3741,7 +4889,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3755,7 +4909,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3769,7 +4929,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3783,7 +4949,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3797,7 +4969,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3812,7 +4990,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3836,6 +5020,12 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3860,11 +5050,16 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3874,6 +5069,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3884,7 +5080,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3898,7 +5100,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3914,6 +5122,12 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3934,7 +5148,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3948,7 +5168,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3962,7 +5188,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3976,7 +5208,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3990,7 +5228,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4004,7 +5248,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4018,7 +5268,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4032,7 +5288,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4047,7 +5309,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4071,6 +5339,12 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4095,11 +5369,16 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4109,6 +5388,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4119,7 +5399,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4133,7 +5419,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4149,6 +5441,12 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4169,7 +5467,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4183,7 +5487,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4197,7 +5507,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4211,7 +5527,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4225,7 +5547,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4239,7 +5567,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4253,7 +5587,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4267,7 +5607,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4282,7 +5628,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4306,6 +5658,12 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4330,11 +5688,16 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4344,6 +5707,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4354,7 +5718,13 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4368,7 +5738,13 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4384,6 +5760,12 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4404,7 +5786,13 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4418,7 +5806,13 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4432,7 +5826,13 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4446,7 +5846,13 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4460,7 +5866,13 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4474,7 +5886,13 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4488,7 +5906,13 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4502,7 +5926,13 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4517,7 +5947,13 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX1250-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4541,6 +5977,12 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4565,11 +6007,16 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4579,6 +6026,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4589,7 +6037,13 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4603,7 +6057,13 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4619,6 +6079,12 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4639,7 +6105,13 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4653,7 +6125,13 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4667,7 +6145,13 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4681,7 +6165,13 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4695,7 +6185,13 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4709,7 +6205,13 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4723,7 +6225,13 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4737,7 +6245,13 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4752,7 +6266,13 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX1250-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4776,6 +6296,12 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4800,11 +6326,16 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4814,6 +6345,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4824,7 +6356,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4838,7 +6376,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4854,6 +6398,12 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4874,7 +6424,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4888,7 +6444,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4902,7 +6464,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4916,7 +6484,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4930,7 +6504,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4944,7 +6524,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4958,7 +6544,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4972,7 +6564,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4987,7 +6585,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5011,6 +6615,12 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5035,11 +6645,16 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5049,6 +6664,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5059,7 +6675,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5073,7 +6695,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5089,6 +6717,12 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5109,7 +6743,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5123,7 +6763,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5137,7 +6783,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5151,7 +6803,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5165,7 +6823,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5179,7 +6843,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5193,7 +6863,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5207,7 +6883,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5222,7 +6904,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5246,6 +6934,12 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5270,11 +6964,16 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5284,6 +6983,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5294,7 +6994,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5308,7 +7014,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5324,6 +7036,12 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5344,7 +7062,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5358,7 +7082,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5372,7 +7102,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5386,7 +7122,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5400,7 +7142,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5414,7 +7162,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5428,7 +7182,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5442,7 +7202,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5457,7 +7223,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5481,6 +7253,12 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5505,11 +7283,16 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5519,6 +7302,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5529,7 +7313,13 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5543,7 +7333,13 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5559,6 +7355,12 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5579,7 +7381,13 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5593,7 +7401,13 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5607,7 +7421,13 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5621,7 +7441,13 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5635,7 +7461,13 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5649,7 +7481,13 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5663,7 +7501,13 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5677,7 +7521,13 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5692,7 +7542,13 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5716,6 +7572,12 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5740,11 +7602,16 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5754,6 +7621,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5764,7 +7632,13 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5778,7 +7652,13 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5794,6 +7674,12 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5814,7 +7700,13 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5828,7 +7720,13 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5842,7 +7740,13 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5856,7 +7760,13 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5870,7 +7780,13 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5884,7 +7800,13 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5898,7 +7820,13 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5912,7 +7840,13 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5927,7 +7861,13 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX1250-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5951,6 +7891,12 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5975,11 +7921,16 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5989,6 +7940,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5999,7 +7951,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6013,7 +7971,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6029,6 +7993,12 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6049,7 +8019,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6063,7 +8039,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6077,7 +8059,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6091,7 +8079,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6105,7 +8099,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6119,7 +8119,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6133,7 +8139,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6147,7 +8159,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6162,7 +8180,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6186,6 +8210,12 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6210,11 +8240,16 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6224,6 +8259,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6234,7 +8270,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6248,7 +8290,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6264,6 +8312,12 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6284,7 +8338,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6298,7 +8358,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6312,7 +8378,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6326,7 +8398,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6340,7 +8418,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6354,7 +8438,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6368,7 +8458,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6382,7 +8478,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6397,7 +8499,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6421,6 +8529,12 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6449,6 +8563,12 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -6476,7 +8596,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6492,7 +8618,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6510,6 +8642,12 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6533,7 +8671,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6549,7 +8693,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6565,7 +8715,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6581,7 +8737,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6597,7 +8759,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6613,7 +8781,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6629,7 +8803,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6645,7 +8825,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6662,7 +8848,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6690,6 +8882,12 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6718,6 +8916,12 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -6745,7 +8949,13 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6761,7 +8971,13 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6779,6 +8995,12 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6802,7 +9024,13 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6818,7 +9046,13 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6834,7 +9068,13 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6850,7 +9090,13 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6866,7 +9112,13 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6882,7 +9134,13 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6898,7 +9156,13 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6914,7 +9178,13 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6931,7 +9201,13 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6959,6 +9235,12 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6987,6 +9269,12 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7014,7 +9302,13 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7030,7 +9324,13 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7048,6 +9348,12 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7071,7 +9377,13 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7087,7 +9399,13 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7103,7 +9421,13 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7119,7 +9443,13 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7135,7 +9465,13 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7151,7 +9487,13 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7167,7 +9509,13 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7183,7 +9531,13 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7200,7 +9554,13 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7228,6 +9588,12 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7256,6 +9622,12 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7283,7 +9655,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7299,7 +9677,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7317,6 +9701,12 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7340,7 +9730,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7356,7 +9752,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7372,7 +9774,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7388,7 +9796,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7404,7 +9818,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7420,7 +9840,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7436,7 +9862,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7452,7 +9884,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7469,7 +9907,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7497,6 +9941,12 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7525,6 +9975,12 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7552,7 +10008,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7568,7 +10030,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7586,6 +10054,12 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7609,7 +10083,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7625,7 +10105,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7641,7 +10127,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7657,7 +10149,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7673,7 +10171,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7689,7 +10193,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7705,7 +10215,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7721,7 +10237,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7738,7 +10260,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7766,6 +10294,12 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7794,6 +10328,12 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7821,7 +10361,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7837,7 +10383,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7855,6 +10407,12 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7878,7 +10436,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7894,7 +10458,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7910,7 +10480,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7926,7 +10502,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7942,7 +10524,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7958,7 +10546,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7974,7 +10568,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7990,7 +10590,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8007,7 +10613,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8035,6 +10647,12 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8063,6 +10681,12 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8090,7 +10714,13 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8106,7 +10736,13 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8124,6 +10760,12 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8147,7 +10789,13 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8163,7 +10811,13 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8179,7 +10833,13 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8195,7 +10855,13 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8211,7 +10877,13 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8227,7 +10899,13 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8243,7 +10921,13 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8259,7 +10943,13 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8276,7 +10966,13 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8304,6 +11000,12 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8332,6 +11034,12 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8359,7 +11067,13 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8375,7 +11089,13 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8393,6 +11113,12 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8416,7 +11142,13 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8432,7 +11164,13 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8448,7 +11186,13 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8464,7 +11208,13 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8480,7 +11230,13 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8496,7 +11252,13 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8512,7 +11274,13 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8528,7 +11296,13 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8545,7 +11319,13 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8573,6 +11353,12 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8601,6 +11387,12 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8628,7 +11420,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8644,7 +11442,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8662,6 +11466,12 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8685,7 +11495,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8701,7 +11517,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8717,7 +11539,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8733,7 +11561,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8749,7 +11583,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8765,7 +11605,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8781,7 +11627,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8797,7 +11649,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8814,7 +11672,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8842,6 +11706,12 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8870,6 +11740,12 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8897,7 +11773,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8913,7 +11795,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8931,6 +11819,12 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8954,7 +11848,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8970,7 +11870,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8986,7 +11892,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9002,7 +11914,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9018,7 +11936,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9034,7 +11958,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9050,7 +11980,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9066,7 +12002,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9083,7 +12025,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9111,6 +12059,12 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9139,6 +12093,12 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9166,7 +12126,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9182,7 +12148,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9200,6 +12172,12 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9223,7 +12201,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9239,7 +12223,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9255,7 +12245,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9271,7 +12267,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9287,7 +12289,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9303,7 +12311,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9319,7 +12333,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9335,7 +12355,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9352,7 +12378,13 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9380,6 +12412,12 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9408,6 +12446,12 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9435,7 +12479,13 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9451,7 +12501,13 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9469,6 +12525,12 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9492,7 +12554,13 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9508,7 +12576,13 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9524,7 +12598,13 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9540,7 +12620,13 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9556,7 +12642,13 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9572,7 +12664,13 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9588,7 +12686,13 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9604,7 +12708,13 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9621,7 +12731,13 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9649,6 +12765,12 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9677,6 +12799,12 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9704,7 +12832,13 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9720,7 +12854,13 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9738,6 +12878,12 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9761,7 +12907,13 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9777,7 +12929,13 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9793,7 +12951,13 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9809,7 +12973,13 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9825,7 +12995,13 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9841,7 +13017,13 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9857,7 +13039,13 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9873,7 +13061,13 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9890,7 +13084,13 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9918,6 +13118,12 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9946,6 +13152,12 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9973,7 +13185,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9989,7 +13207,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10007,6 +13231,12 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10030,7 +13260,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10046,7 +13282,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10062,7 +13304,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10078,7 +13326,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10094,7 +13348,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10110,7 +13370,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10126,7 +13392,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10142,7 +13414,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10159,7 +13437,13 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10187,6 +13471,12 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10215,6 +13505,12 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10242,7 +13538,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10258,7 +13560,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10276,6 +13584,12 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10299,7 +13613,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10315,7 +13635,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10331,7 +13657,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10347,7 +13679,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10363,7 +13701,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10379,7 +13723,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10395,7 +13745,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10411,7 +13767,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10428,7 +13790,13 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10455,6 +13823,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; GFX6-LABEL: global_singlethread_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10482,12 +13853,16 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -10496,29 +13871,38 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10543,101 +13927,131 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_singlethread_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_singlethread_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -10651,6 +14065,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; GFX6-LABEL: global_singlethread_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10678,12 +14095,16 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -10692,29 +14113,38 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10739,101 +14169,131 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_singlethread_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -10847,6 +14307,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; GFX6-LABEL: global_singlethread_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10874,12 +14337,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -10888,29 +14355,38 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10935,101 +14411,131 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_singlethread_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -11043,6 +14549,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11070,12 +14579,16 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -11084,29 +14597,38 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11131,101 +14653,131 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -11239,6 +14791,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; GFX6-LABEL: global_singlethread_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11260,6 +14815,10 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -11270,27 +14829,38 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11308,93 +14878,129 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_singlethread_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_singlethread_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -11407,6 +15013,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; GFX6-LABEL: global_singlethread_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11428,6 +15037,10 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -11438,27 +15051,38 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11476,93 +15100,129 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_singlethread_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -11575,6 +15235,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX6-LABEL: global_singlethread_one_as_release_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11596,6 +15259,10 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -11606,27 +15273,38 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11644,93 +15322,129 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_singlethread_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_singlethread_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -11743,6 +15457,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11764,6 +15481,10 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -11774,27 +15495,38 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11812,93 +15544,129 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -11911,6 +15679,10 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; GFX6-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -11930,18 +15702,26 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11951,7 +15731,11 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11962,6 +15746,10 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -11978,7 +15766,11 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11988,7 +15780,11 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11998,7 +15794,11 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12008,7 +15808,11 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12018,7 +15822,11 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12028,7 +15836,11 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12038,7 +15850,11 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12048,7 +15864,11 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12059,7 +15879,11 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; GFX1250-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12077,6 +15901,10 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX6-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -12096,18 +15924,26 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12117,7 +15953,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12128,6 +15968,10 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -12144,7 +15988,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12154,7 +16002,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12164,7 +16016,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12174,7 +16030,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12184,7 +16044,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12194,7 +16058,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12204,7 +16072,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12214,7 +16086,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12225,7 +16101,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX1250-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12243,6 +16123,10 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX6-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -12262,18 +16146,26 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12283,7 +16175,11 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12294,6 +16190,10 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -12310,7 +16210,11 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12320,7 +16224,11 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12330,7 +16238,11 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12340,7 +16252,11 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12350,7 +16266,11 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12360,7 +16280,11 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12370,7 +16294,11 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12380,7 +16308,11 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; ; GFX12-CU-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12391,7 +16323,11 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX1250-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12409,6 +16345,10 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -12428,18 +16368,26 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12449,7 +16397,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12460,6 +16412,10 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -12476,7 +16432,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12486,7 +16446,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12496,7 +16460,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12506,7 +16474,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12516,7 +16488,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12526,7 +16502,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12536,7 +16516,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12546,7 +16530,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12557,7 +16545,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX1250-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12575,6 +16567,10 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -12594,18 +16590,26 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12615,7 +16619,11 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12626,6 +16634,10 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -12642,7 +16654,11 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12652,7 +16668,11 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12662,7 +16682,11 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12672,7 +16696,11 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12682,7 +16710,11 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12692,7 +16724,11 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12702,7 +16738,11 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12712,7 +16752,11 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12723,7 +16767,11 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12741,6 +16789,10 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -12763,6 +16815,10 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -12777,7 +16833,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12789,7 +16849,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12802,6 +16866,10 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -12820,7 +16888,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12832,7 +16904,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12844,7 +16920,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12856,7 +16936,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12868,7 +16952,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12880,7 +16968,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12892,7 +16984,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12904,7 +17000,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12917,7 +17017,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; GFX1250-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12938,6 +17042,10 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -12960,6 +17068,10 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -12974,7 +17086,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12986,7 +17102,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12999,6 +17119,10 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -13017,7 +17141,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13029,7 +17157,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13041,7 +17173,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13053,7 +17189,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13065,7 +17205,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13077,7 +17221,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13089,7 +17237,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13101,7 +17253,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13114,7 +17270,11 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX1250-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13135,6 +17295,10 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -13157,6 +17321,10 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13171,7 +17339,11 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13183,7 +17355,11 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13196,6 +17372,10 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -13214,7 +17394,11 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13226,7 +17410,11 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13238,7 +17426,11 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13250,7 +17442,11 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13262,7 +17458,11 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13274,7 +17474,11 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13286,7 +17490,11 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13298,7 +17506,11 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13311,7 +17523,11 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13333,6 +17549,12 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13357,11 +17579,16 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -13371,6 +17598,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -13381,7 +17609,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13395,7 +17629,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13411,6 +17651,12 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13431,7 +17677,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13445,7 +17697,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13459,7 +17717,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13473,7 +17737,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13487,7 +17757,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13501,7 +17777,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13515,7 +17797,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13529,7 +17817,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13544,7 +17838,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; GFX1250-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -13568,6 +17868,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13592,11 +17898,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -13606,6 +17917,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -13616,7 +17928,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13630,7 +17948,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13646,6 +17970,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13666,7 +17996,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13680,7 +18016,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13694,7 +18036,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13708,7 +18056,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13722,7 +18076,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13736,7 +18096,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13750,7 +18116,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13764,7 +18136,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13779,7 +18157,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -13803,6 +18187,12 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13827,11 +18217,16 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -13841,6 +18236,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -13851,7 +18247,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13865,7 +18267,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13881,6 +18289,12 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13901,7 +18315,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13915,7 +18335,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13929,7 +18355,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13943,7 +18375,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13957,7 +18395,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13971,7 +18415,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13985,7 +18435,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13999,7 +18455,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14014,7 +18476,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX1250-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -14038,6 +18506,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14062,11 +18536,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14076,6 +18555,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -14086,7 +18566,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14100,7 +18586,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14116,6 +18608,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14136,7 +18634,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14150,7 +18654,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14164,7 +18674,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14178,7 +18694,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14192,7 +18714,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14206,7 +18734,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14220,7 +18754,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14234,7 +18774,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14249,7 +18795,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -14273,6 +18825,12 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14297,11 +18855,16 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14311,6 +18874,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -14321,7 +18885,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14335,7 +18905,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14351,6 +18927,12 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14371,7 +18953,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14385,7 +18973,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14399,7 +18993,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14413,7 +19013,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14427,7 +19033,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14441,7 +19053,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14455,7 +19073,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14469,7 +19093,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14484,7 +19114,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -14508,6 +19144,12 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14532,11 +19174,16 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14546,6 +19193,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -14556,7 +19204,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14570,7 +19224,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14586,6 +19246,12 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14606,7 +19272,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14620,7 +19292,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14634,7 +19312,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14648,7 +19332,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14662,7 +19352,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14676,7 +19372,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14690,7 +19392,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14704,7 +19412,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14719,7 +19433,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -14743,6 +19463,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14767,11 +19493,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14781,6 +19512,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -14791,7 +19523,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14805,7 +19543,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14821,6 +19565,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14841,7 +19591,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14855,7 +19611,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14869,7 +19631,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14883,7 +19651,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14897,7 +19671,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14911,7 +19691,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14925,7 +19711,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14939,7 +19731,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14954,7 +19752,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX1250-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -14978,6 +19782,12 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15002,11 +19812,16 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15016,6 +19831,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15026,7 +19842,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15040,7 +19862,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15056,6 +19884,12 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15076,7 +19910,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15090,7 +19930,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15104,7 +19950,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15118,7 +19970,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15132,7 +19990,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15146,7 +20010,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15160,7 +20030,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15174,7 +20050,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15189,7 +20071,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX1250-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15213,6 +20101,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15237,11 +20131,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15251,6 +20150,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15261,7 +20161,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15275,7 +20181,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15291,6 +20203,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15311,7 +20229,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15325,7 +20249,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15339,7 +20269,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15353,7 +20289,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15367,7 +20309,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15381,7 +20329,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15395,7 +20349,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15409,7 +20369,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15424,7 +20390,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15448,6 +20420,12 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15472,11 +20450,16 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15486,6 +20469,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15496,7 +20480,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15510,7 +20500,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15526,6 +20522,12 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15546,7 +20548,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15560,7 +20568,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15574,7 +20588,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15588,7 +20608,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15602,7 +20628,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15616,7 +20648,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15630,7 +20668,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15644,7 +20688,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15659,7 +20709,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15683,6 +20739,12 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15707,11 +20769,16 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15721,6 +20788,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15731,7 +20799,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15745,7 +20819,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15761,6 +20841,12 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15781,7 +20867,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15795,7 +20887,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15809,7 +20907,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15823,7 +20927,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15837,7 +20947,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15851,7 +20967,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15865,7 +20987,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15879,7 +21007,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15894,7 +21028,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15918,6 +21058,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15942,11 +21088,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15956,6 +21107,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15966,7 +21118,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15980,7 +21138,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15996,6 +21160,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16016,7 +21186,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16030,7 +21206,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16044,7 +21226,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16058,7 +21246,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16072,7 +21266,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16086,7 +21286,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16100,7 +21306,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16114,7 +21326,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16129,7 +21347,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16153,6 +21377,12 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16177,11 +21407,16 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16191,6 +21426,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16201,7 +21437,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16215,7 +21457,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16231,6 +21479,12 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16251,7 +21505,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16265,7 +21525,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16279,7 +21545,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16293,7 +21565,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16307,7 +21585,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16321,7 +21605,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16335,7 +21625,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16349,7 +21645,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16364,7 +21666,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX1250-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16388,6 +21696,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16412,11 +21726,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16426,6 +21745,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16436,7 +21756,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16450,7 +21776,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16466,6 +21798,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16486,7 +21824,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16500,7 +21844,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16514,7 +21864,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16528,7 +21884,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16542,7 +21904,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16556,7 +21924,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16570,7 +21944,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16584,7 +21964,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16599,7 +21985,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16623,6 +22015,12 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16647,11 +22045,16 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16661,6 +22064,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16671,7 +22075,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16685,7 +22095,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16701,6 +22117,12 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16721,7 +22143,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16735,7 +22163,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16749,7 +22183,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16763,7 +22203,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16777,7 +22223,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16791,7 +22243,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16805,7 +22263,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16819,7 +22283,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16834,7 +22304,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16858,6 +22334,12 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16886,6 +22368,12 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -16913,7 +22401,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16929,7 +22423,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16947,6 +22447,12 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16970,7 +22476,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16986,7 +22498,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17002,7 +22520,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17018,7 +22542,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17034,7 +22564,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17050,7 +22586,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17066,7 +22608,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17082,7 +22630,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17099,7 +22653,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; GFX1250-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17127,6 +22687,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17155,6 +22721,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -17182,7 +22754,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17198,7 +22776,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17216,6 +22800,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17239,7 +22829,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17255,7 +22851,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17271,7 +22873,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17287,7 +22895,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17303,7 +22917,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17319,7 +22939,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17335,7 +22961,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17351,7 +22983,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17368,7 +23006,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; GFX1250-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17396,6 +23040,12 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17424,6 +23074,12 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -17451,7 +23107,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX10-WGP-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17467,7 +23129,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17485,6 +23153,12 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17508,7 +23182,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17524,7 +23204,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17540,7 +23226,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17556,7 +23248,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17572,7 +23270,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX11-WGP-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17588,7 +23292,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17604,7 +23314,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX12-WGP-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17620,7 +23336,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX12-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17637,7 +23359,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX1250-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17665,6 +23393,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17693,6 +23427,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -17720,7 +23460,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17736,7 +23482,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17754,6 +23506,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17777,7 +23535,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17793,7 +23557,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17809,7 +23579,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17825,7 +23601,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17841,7 +23623,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17857,7 +23645,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17873,7 +23667,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17889,7 +23689,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17906,7 +23712,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX1250-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17934,6 +23746,12 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17962,6 +23780,12 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -17989,7 +23813,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18005,7 +23835,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18023,6 +23859,12 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -18046,7 +23888,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18062,7 +23910,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18078,7 +23932,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18094,7 +23954,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18110,7 +23976,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18126,7 +23998,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18142,7 +24020,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18158,7 +24042,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18175,7 +24065,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18203,6 +24099,12 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -18231,6 +24133,12 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -18258,7 +24166,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18274,7 +24188,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18292,6 +24212,12 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -18315,7 +24241,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18331,7 +24263,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18347,7 +24285,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18363,7 +24307,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18379,7 +24329,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18395,7 +24351,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18411,7 +24373,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18427,7 +24395,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18444,7 +24418,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; GFX1250-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18472,6 +24452,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -18500,6 +24486,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -18527,7 +24519,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18543,7 +24541,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18561,6 +24565,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -18584,7 +24594,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18600,7 +24616,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18616,7 +24638,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18632,7 +24660,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18648,7 +24682,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18664,7 +24704,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18680,7 +24726,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18696,7 +24748,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18713,7 +24771,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; GFX1250-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18741,6 +24805,12 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -18769,6 +24839,12 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -18796,7 +24872,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX10-WGP-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18812,7 +24894,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18830,6 +24918,12 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -18853,7 +24947,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18869,7 +24969,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18885,7 +24991,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18901,7 +25013,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18917,7 +25035,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX11-WGP-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18933,7 +25057,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18949,7 +25079,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX12-WGP-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18965,7 +25101,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX12-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18982,7 +25124,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX1250-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19010,6 +25158,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -19038,6 +25192,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19065,7 +25225,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19081,7 +25247,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19099,6 +25271,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -19122,7 +25300,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19138,7 +25322,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19154,7 +25344,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19170,7 +25366,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19186,7 +25388,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19202,7 +25410,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19218,7 +25432,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19234,7 +25454,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19251,7 +25477,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX1250-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19279,6 +25511,12 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -19307,6 +25545,12 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19334,7 +25578,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19350,7 +25600,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19368,6 +25624,12 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -19391,7 +25653,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19407,7 +25675,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19423,7 +25697,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19439,7 +25719,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19455,7 +25741,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19471,7 +25763,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19487,7 +25785,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19503,7 +25807,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19520,7 +25830,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19548,6 +25864,12 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -19576,6 +25898,12 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19603,7 +25931,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19619,7 +25953,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19637,6 +25977,12 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -19660,7 +26006,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19676,7 +26028,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19692,7 +26050,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19708,7 +26072,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19724,7 +26094,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19740,7 +26116,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19756,7 +26138,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19772,7 +26160,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19789,7 +26183,13 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX1250-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19817,6 +26217,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -19845,6 +26251,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19872,7 +26284,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19888,7 +26306,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19906,6 +26330,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -19929,7 +26359,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19945,7 +26381,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19961,7 +26403,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19977,7 +26425,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19993,7 +26447,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20009,7 +26469,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20025,7 +26491,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20041,7 +26513,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20058,7 +26536,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX1250-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20086,6 +26570,12 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -20114,6 +26604,12 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20141,7 +26637,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX10-WGP-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20157,7 +26659,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20175,6 +26683,12 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -20198,7 +26712,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20214,7 +26734,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20230,7 +26756,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20246,7 +26778,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20262,7 +26800,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX11-WGP-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20278,7 +26822,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20294,7 +26844,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX12-WGP-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20310,7 +26866,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX12-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20327,7 +26889,13 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX1250-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20355,6 +26923,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -20383,6 +26957,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20410,7 +26990,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20426,7 +27012,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20444,6 +27036,12 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -20467,7 +27065,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20483,7 +27087,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20499,7 +27109,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20515,7 +27131,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20531,7 +27153,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20547,7 +27175,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20563,7 +27197,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20579,7 +27219,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20596,7 +27242,13 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX1250-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20624,6 +27276,12 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -20652,6 +27310,12 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20679,7 +27343,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20695,7 +27365,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20713,6 +27389,12 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -20736,7 +27418,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20752,7 +27440,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20768,7 +27462,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20784,7 +27484,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20800,7 +27506,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20816,7 +27528,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20832,7 +27550,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20848,7 +27572,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20865,7 +27595,13 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index ab6c2820a522..e6cad083d94d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -18,6 +18,9 @@ define amdgpu_kernel void @global_system_unordered_load( ; GFX6-LABEL: global_system_unordered_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -45,12 +48,16 @@ define amdgpu_kernel void @global_system_unordered_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -59,29 +66,38 @@ define amdgpu_kernel void @global_system_unordered_load( ; ; GFX10-WGP-LABEL: global_system_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -106,101 +122,131 @@ define amdgpu_kernel void @global_system_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_system_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_system_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_system_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_system_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -214,6 +260,9 @@ define amdgpu_kernel void @global_system_monotonic_load( ; GFX6-LABEL: global_system_monotonic_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -241,12 +290,16 @@ define amdgpu_kernel void @global_system_monotonic_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -255,29 +308,38 @@ define amdgpu_kernel void @global_system_monotonic_load( ; ; GFX10-WGP-LABEL: global_system_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -302,101 +364,131 @@ define amdgpu_kernel void @global_system_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_system_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_system_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_system_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_system_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -410,6 +502,9 @@ define amdgpu_kernel void @global_system_acquire_load( ; GFX6-LABEL: global_system_acquire_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -438,14 +533,18 @@ define amdgpu_kernel void @global_system_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_store_dword v[0:1], v2 @@ -453,33 +552,44 @@ define amdgpu_kernel void @global_system_acquire_load( ; ; GFX10-WGP-LABEL: global_system_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -504,115 +614,151 @@ define amdgpu_kernel void @global_system_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_system_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_system_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_system_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_system_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -626,6 +772,9 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; GFX6-LABEL: global_system_seq_cst_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -655,9 +804,12 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -671,8 +823,12 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX10-WGP-LABEL: global_system_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -685,8 +841,12 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX10-CU-LABEL: global_system_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -700,6 +860,9 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -725,8 +888,12 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc @@ -738,8 +905,12 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc @@ -751,8 +922,12 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 @@ -763,8 +938,12 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX942-TGSPLIT-LABEL: global_system_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 @@ -775,8 +954,12 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX11-WGP-LABEL: global_system_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -789,8 +972,12 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX11-CU-LABEL: global_system_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -803,49 +990,61 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX12-WGP-LABEL: global_system_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_system_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -859,6 +1058,9 @@ define amdgpu_kernel void @global_system_unordered_store( ; GFX6-LABEL: global_system_unordered_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -880,6 +1082,10 @@ define amdgpu_kernel void @global_system_unordered_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -890,27 +1096,38 @@ define amdgpu_kernel void @global_system_unordered_store( ; ; GFX10-WGP-LABEL: global_system_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -928,93 +1145,129 @@ define amdgpu_kernel void @global_system_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_system_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_system_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_system_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_system_unordered_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1027,6 +1280,9 @@ define amdgpu_kernel void @global_system_monotonic_store( ; GFX6-LABEL: global_system_monotonic_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1048,6 +1304,10 @@ define amdgpu_kernel void @global_system_monotonic_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1058,27 +1318,38 @@ define amdgpu_kernel void @global_system_monotonic_store( ; ; GFX10-WGP-LABEL: global_system_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1096,93 +1367,129 @@ define amdgpu_kernel void @global_system_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_system_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_system_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_system_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_system_monotonic_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1195,6 +1502,9 @@ define amdgpu_kernel void @global_system_release_store( ; GFX6-LABEL: global_system_release_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1217,6 +1527,10 @@ define amdgpu_kernel void @global_system_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1228,10 +1542,13 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX10-WGP-LABEL: global_system_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1240,10 +1557,13 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX10-CU-LABEL: global_system_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1253,6 +1573,9 @@ define amdgpu_kernel void @global_system_release_store( ; SKIP-CACHE-INV-LABEL: global_system_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1271,10 +1594,13 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1283,10 +1609,13 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX90A-TGSPLIT-LABEL: global_system_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1295,10 +1624,13 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1307,10 +1639,13 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX942-TGSPLIT-LABEL: global_system_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1319,10 +1654,13 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX11-WGP-LABEL: global_system_release_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1331,10 +1669,13 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX11-CU-LABEL: global_system_release_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1343,41 +1684,52 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX12-WGP-LABEL: global_system_release_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_release_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_system_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1385,6 +1737,7 @@ define amdgpu_kernel void @global_system_release_store( ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1397,6 +1750,9 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; GFX6-LABEL: global_system_seq_cst_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1419,6 +1775,10 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1430,10 +1790,13 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX10-WGP-LABEL: global_system_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1442,10 +1805,13 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX10-CU-LABEL: global_system_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1455,6 +1821,9 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1473,10 +1842,13 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1485,10 +1857,13 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1497,10 +1872,13 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1509,10 +1887,13 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX942-TGSPLIT-LABEL: global_system_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1521,10 +1902,13 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX11-WGP-LABEL: global_system_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1533,10 +1917,13 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX11-CU-LABEL: global_system_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1545,41 +1932,52 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX12-WGP-LABEL: global_system_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_system_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1587,6 +1985,7 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1599,6 +1998,10 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX6-LABEL: global_system_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -1618,18 +2021,26 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1639,7 +2050,11 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: global_system_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1650,6 +2065,10 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_system_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -1666,7 +2085,11 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1676,7 +2099,11 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1686,7 +2113,11 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1696,7 +2127,11 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_system_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1706,7 +2141,11 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: global_system_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1716,7 +2155,11 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: global_system_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1726,7 +2169,11 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: global_system_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1736,7 +2183,11 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: global_system_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1747,7 +2198,11 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX1250-LABEL: global_system_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1765,6 +2220,10 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX6-LABEL: global_system_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -1786,11 +2245,15 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1799,7 +2262,11 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: global_system_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1812,7 +2279,11 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; ; GFX10-CU-LABEL: global_system_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1826,6 +2297,10 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_system_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -1843,7 +2318,11 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1856,7 +2335,11 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1869,7 +2352,11 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1881,7 +2368,11 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_system_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1893,7 +2384,11 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: global_system_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1906,7 +2401,11 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; ; GFX11-CU-LABEL: global_system_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1919,7 +2418,11 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: global_system_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1931,7 +2434,11 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; ; GFX12-CU-LABEL: global_system_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1944,7 +2451,11 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX1250-LABEL: global_system_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1965,6 +2476,10 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX6-LABEL: global_system_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -1985,11 +2500,15 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -1997,7 +2516,11 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; ; GFX10-WGP-LABEL: global_system_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2009,7 +2532,11 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; ; GFX10-CU-LABEL: global_system_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2022,6 +2549,10 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_system_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2039,7 +2570,11 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2051,7 +2586,11 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_system_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2063,7 +2602,11 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2075,7 +2618,11 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_system_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2087,7 +2634,11 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; ; GFX11-WGP-LABEL: global_system_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2099,7 +2650,11 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; ; GFX11-CU-LABEL: global_system_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2111,7 +2666,11 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; ; GFX12-WGP-LABEL: global_system_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2126,7 +2685,11 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; ; GFX12-CU-LABEL: global_system_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2142,7 +2705,11 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX1250-LABEL: global_system_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2165,6 +2732,10 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX6-LABEL: global_system_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2187,11 +2758,15 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -2201,7 +2776,11 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: global_system_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2216,7 +2795,11 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: global_system_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2232,6 +2815,10 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2250,7 +2837,11 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2265,7 +2856,11 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2280,7 +2875,11 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2294,7 +2893,11 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_system_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2308,7 +2911,11 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: global_system_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2323,7 +2930,11 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: global_system_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2338,7 +2949,11 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: global_system_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2355,7 +2970,11 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: global_system_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2373,7 +2992,11 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX1250-LABEL: global_system_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2399,6 +3022,10 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX6-LABEL: global_system_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2421,11 +3048,15 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -2435,7 +3066,11 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: global_system_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2450,7 +3085,11 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: global_system_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2466,6 +3105,10 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2484,7 +3127,11 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2499,7 +3146,11 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2514,7 +3165,11 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2528,7 +3183,11 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_system_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2542,7 +3201,11 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: global_system_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2557,7 +3220,11 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: global_system_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2572,7 +3239,11 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: global_system_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2589,7 +3260,11 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: global_system_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2607,7 +3282,11 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX1250-LABEL: global_system_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2633,6 +3312,10 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX6-LABEL: global_system_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2656,6 +3339,10 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2671,7 +3358,11 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_system_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2685,7 +3376,11 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_system_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2700,6 +3395,10 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_system_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2718,7 +3417,11 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2732,7 +3435,11 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2746,7 +3453,11 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2759,7 +3470,11 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_system_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2772,7 +3487,11 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_system_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2786,7 +3505,11 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_system_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2800,7 +3523,11 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_system_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2813,7 +3540,11 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_system_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2827,7 +3558,11 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX1250-LABEL: global_system_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2850,6 +3585,10 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2874,6 +3613,10 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2890,7 +3633,11 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2906,7 +3653,11 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2923,6 +3674,10 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2942,7 +3697,11 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2958,7 +3717,11 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2974,7 +3737,11 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2989,7 +3756,11 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3004,7 +3775,11 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3020,7 +3795,11 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3036,7 +3815,11 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3054,7 +3837,11 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3073,7 +3860,11 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX1250-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -3101,6 +3892,10 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -3125,6 +3920,10 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -3141,7 +3940,11 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3157,7 +3960,11 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3174,6 +3981,10 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -3193,7 +4004,11 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3209,7 +4024,11 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3225,7 +4044,11 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3240,7 +4063,11 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3255,7 +4082,11 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3271,7 +4102,11 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3287,7 +4122,11 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3305,7 +4144,11 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3324,7 +4167,11 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX1250-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -3353,6 +4200,12 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3377,11 +4230,16 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3391,6 +4249,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3401,7 +4260,13 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3415,7 +4280,13 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3431,6 +4302,12 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3451,7 +4328,13 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3465,7 +4348,13 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3479,7 +4368,13 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3493,7 +4388,13 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3507,7 +4408,13 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3521,7 +4428,13 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3535,7 +4448,13 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3549,7 +4468,13 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3564,7 +4489,13 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3588,6 +4519,12 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3614,11 +4551,16 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3628,6 +4570,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3640,7 +4583,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3657,7 +4606,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3676,6 +4631,12 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3697,7 +4658,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3714,7 +4681,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3731,7 +4704,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3747,7 +4726,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3763,7 +4748,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3780,7 +4771,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3797,7 +4794,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3813,7 +4816,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3830,7 +4839,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3857,6 +4872,12 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3882,11 +4903,16 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3896,6 +4922,7 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3907,7 +4934,13 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3923,7 +4956,13 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_system_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3941,6 +4980,12 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3962,7 +5007,13 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3978,7 +5029,13 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3994,7 +5051,13 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4010,7 +5073,13 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4026,7 +5095,13 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4042,7 +5117,13 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_system_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4058,7 +5139,13 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4077,7 +5164,13 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_system_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4097,7 +5190,13 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; GFX1250-LABEL: global_system_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4126,6 +5225,12 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4153,11 +5258,16 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4167,6 +5277,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4180,7 +5291,13 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4199,7 +5316,13 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4220,6 +5343,12 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4242,7 +5371,13 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4261,7 +5396,13 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4280,7 +5421,13 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4298,7 +5445,13 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4316,7 +5469,13 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4335,7 +5494,13 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4354,7 +5519,13 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4375,7 +5546,13 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4397,7 +5574,13 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4429,6 +5612,12 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4456,11 +5645,16 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4470,6 +5664,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4483,7 +5678,13 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4502,7 +5703,13 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4523,6 +5730,12 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4545,7 +5758,13 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4564,7 +5783,13 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4583,7 +5808,13 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4601,7 +5832,13 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4619,7 +5856,13 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4638,7 +5881,13 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4657,7 +5906,13 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4678,7 +5933,13 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4700,7 +5961,13 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4732,6 +5999,12 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4758,11 +6031,16 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4772,6 +6050,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4784,7 +6063,13 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4801,7 +6086,13 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4820,6 +6111,12 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4841,7 +6138,13 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4858,7 +6161,13 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4875,7 +6184,13 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4891,7 +6206,13 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4907,7 +6228,13 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4924,7 +6251,13 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4941,7 +6274,13 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4957,7 +6296,13 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4974,7 +6319,13 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5001,6 +6352,12 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5027,11 +6384,16 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5041,6 +6403,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5053,7 +6416,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5070,7 +6439,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5089,6 +6464,12 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5110,7 +6491,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5127,7 +6514,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5144,7 +6537,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5160,7 +6559,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5176,7 +6581,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5193,7 +6604,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5210,7 +6627,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5226,7 +6649,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5243,7 +6672,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX1250-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5270,6 +6705,12 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5297,11 +6738,16 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5311,6 +6757,7 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5324,7 +6771,13 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5343,7 +6796,13 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_system_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5364,6 +6823,12 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5386,7 +6851,13 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5405,7 +6876,13 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5424,7 +6901,13 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5442,7 +6925,13 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5460,7 +6949,13 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5479,7 +6974,13 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_system_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5498,7 +6999,13 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5519,7 +7026,13 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_system_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5541,7 +7054,13 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX1250-LABEL: global_system_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5573,6 +7092,12 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5600,11 +7125,16 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5614,6 +7144,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5627,7 +7158,13 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5646,7 +7183,13 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5667,6 +7210,12 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5689,7 +7238,13 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5708,7 +7263,13 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5727,7 +7288,13 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5745,7 +7312,13 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5763,7 +7336,13 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5782,7 +7361,13 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5801,7 +7386,13 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5822,7 +7413,13 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5844,7 +7441,13 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5876,6 +7479,12 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5903,11 +7512,16 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5917,6 +7531,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5930,7 +7545,13 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5949,7 +7570,13 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5970,6 +7597,12 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5992,7 +7625,13 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6011,7 +7650,13 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6030,7 +7675,13 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6048,7 +7699,13 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6066,7 +7723,13 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6085,7 +7748,13 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6104,7 +7773,13 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6125,7 +7800,13 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6147,7 +7828,13 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6179,6 +7866,12 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6206,11 +7899,16 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6220,6 +7918,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6233,7 +7932,13 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6252,7 +7957,13 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6273,6 +7984,12 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6295,7 +8012,13 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6314,7 +8037,13 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6333,7 +8062,13 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6351,7 +8086,13 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6369,7 +8110,13 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6388,7 +8135,13 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6407,7 +8160,13 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6428,7 +8187,13 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6450,7 +8215,13 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6482,6 +8253,12 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6510,6 +8287,12 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -6537,7 +8320,13 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6553,7 +8342,13 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6571,6 +8366,12 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6594,7 +8395,13 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6610,7 +8417,13 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6626,7 +8439,13 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6642,7 +8461,13 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6658,7 +8483,13 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6674,7 +8505,13 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6690,7 +8527,13 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6706,7 +8549,13 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6723,7 +8572,13 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6751,6 +8606,12 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6780,6 +8641,12 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -6808,7 +8675,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6826,7 +8699,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6846,6 +8725,12 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6869,7 +8754,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6887,7 +8778,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6905,7 +8802,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6922,7 +8825,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6939,7 +8848,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6957,7 +8872,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6975,7 +8896,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6992,7 +8919,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7010,7 +8943,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7040,6 +8979,12 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7070,6 +9015,12 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7099,7 +9050,13 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7119,7 +9076,13 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7141,6 +9104,12 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7165,7 +9134,13 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7185,7 +9160,13 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7205,7 +9186,13 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7224,7 +9211,13 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7243,7 +9236,13 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7263,7 +9262,13 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7283,7 +9288,13 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7305,7 +9316,13 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7328,7 +9345,13 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7363,6 +9386,12 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7393,6 +9422,12 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7422,7 +9457,13 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7442,7 +9483,13 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7464,6 +9511,12 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7488,7 +9541,13 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7508,7 +9567,13 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7528,7 +9593,13 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7547,7 +9618,13 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7566,7 +9643,13 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7586,7 +9669,13 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7606,7 +9695,13 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7628,7 +9723,13 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7651,7 +9752,13 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7686,6 +9793,12 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7715,6 +9828,12 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7743,7 +9862,13 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7761,7 +9886,13 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7781,6 +9912,12 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7804,7 +9941,13 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7822,7 +9965,13 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7840,7 +9989,13 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7857,7 +10012,13 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7874,7 +10035,13 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7892,7 +10059,13 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7910,7 +10083,13 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7927,7 +10106,13 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7945,7 +10130,13 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7975,6 +10166,12 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8004,6 +10201,12 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8032,7 +10235,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8050,7 +10259,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8070,6 +10285,12 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8093,7 +10314,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8111,7 +10338,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8129,7 +10362,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8146,7 +10385,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8163,7 +10408,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8181,7 +10432,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8199,7 +10456,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8216,7 +10479,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8234,7 +10503,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8264,6 +10539,12 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8294,6 +10575,12 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8323,7 +10610,13 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8343,7 +10636,13 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8365,6 +10664,12 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8389,7 +10694,13 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8409,7 +10720,13 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8429,7 +10746,13 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8448,7 +10771,13 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8467,7 +10796,13 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8487,7 +10822,13 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8507,7 +10848,13 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8529,7 +10876,13 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8552,7 +10905,13 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8587,6 +10946,12 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8617,6 +10982,12 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8646,7 +11017,13 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8666,7 +11043,13 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8688,6 +11071,12 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8712,7 +11101,13 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8732,7 +11127,13 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8752,7 +11153,13 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8771,7 +11178,13 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8790,7 +11203,13 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8810,7 +11229,13 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8830,7 +11255,13 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8852,7 +11283,13 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8875,7 +11312,13 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8910,6 +11353,12 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8940,6 +11389,12 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8969,7 +11424,13 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8989,7 +11450,13 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9011,6 +11478,12 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9035,7 +11508,13 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9055,7 +11534,13 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9075,7 +11560,13 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9094,7 +11585,13 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9113,7 +11610,13 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9133,7 +11636,13 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9153,7 +11662,13 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9175,7 +11690,13 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9198,7 +11719,13 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9233,6 +11760,12 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9263,6 +11796,12 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9292,7 +11831,13 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9312,7 +11857,13 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9334,6 +11885,12 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9358,7 +11915,13 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9378,7 +11941,13 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9398,7 +11967,13 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9417,7 +11992,13 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9436,7 +12017,13 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9456,7 +12043,13 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9476,7 +12069,13 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9498,7 +12097,13 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9521,7 +12126,13 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9556,6 +12167,12 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9586,6 +12203,12 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9615,7 +12238,13 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9635,7 +12264,13 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9657,6 +12292,12 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9681,7 +12322,13 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9701,7 +12348,13 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9721,7 +12374,13 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9740,7 +12399,13 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9759,7 +12424,13 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9779,7 +12450,13 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9799,7 +12476,13 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9821,7 +12504,13 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9844,7 +12533,13 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9879,6 +12574,12 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9909,6 +12610,12 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9938,7 +12645,13 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9958,7 +12671,13 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9980,6 +12699,12 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10004,7 +12729,13 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10024,7 +12755,13 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10044,7 +12781,13 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10063,7 +12806,13 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10082,7 +12831,13 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10102,7 +12857,13 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10122,7 +12883,13 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10144,7 +12911,13 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10167,7 +12940,13 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10202,6 +12981,12 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10232,6 +13017,12 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10261,7 +13052,13 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10281,7 +13078,13 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10303,6 +13106,12 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10327,7 +13136,13 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10347,7 +13162,13 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10367,7 +13188,13 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10386,7 +13213,13 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10405,7 +13238,13 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10425,7 +13264,13 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10445,7 +13290,13 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10467,7 +13318,13 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10490,7 +13347,13 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10525,6 +13388,12 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10555,6 +13424,12 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10584,7 +13459,13 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10604,7 +13485,13 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10626,6 +13513,12 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10650,7 +13543,13 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10670,7 +13569,13 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10690,7 +13595,13 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10709,7 +13620,13 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10728,7 +13645,13 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10748,7 +13671,13 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10768,7 +13697,13 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10790,7 +13725,13 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10813,7 +13754,13 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10847,6 +13794,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; GFX6-LABEL: global_system_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10874,12 +13824,16 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -10888,29 +13842,38 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; ; GFX10-WGP-LABEL: global_system_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10935,101 +13898,131 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_system_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_system_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -11043,6 +14036,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; GFX6-LABEL: global_system_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11070,12 +14066,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -11084,29 +14084,38 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11131,101 +14140,131 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_system_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_system_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -11239,6 +14278,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; GFX6-LABEL: global_system_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11267,14 +14309,18 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_store_dword v[0:1], v2 @@ -11282,33 +14328,44 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11333,115 +14390,151 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_system_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_system_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -11455,6 +14548,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; GFX6-LABEL: global_system_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11484,15 +14580,19 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_store_dword v[0:1], v2 @@ -11500,35 +14600,48 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11554,127 +14667,169 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_system_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -11688,6 +14843,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; GFX6-LABEL: global_system_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11709,6 +14867,10 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -11719,27 +14881,38 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; ; GFX10-WGP-LABEL: global_system_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11757,93 +14930,129 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_system_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_system_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -11856,6 +15065,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; GFX6-LABEL: global_system_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11877,6 +15089,10 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -11887,27 +15103,38 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11925,93 +15152,129 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_system_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_system_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -12024,6 +15287,9 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; GFX6-LABEL: global_system_one_as_release_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -12046,6 +15312,10 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -12057,24 +15327,30 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; ; GFX10-WGP-LABEL: global_system_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm @@ -12082,6 +15358,9 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12100,113 +15379,142 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_system_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_system_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -12214,6 +15522,7 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -12226,6 +15535,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; GFX6-LABEL: global_system_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -12248,6 +15560,10 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -12259,24 +15575,30 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm @@ -12284,6 +15606,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12302,113 +15627,142 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_system_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -12416,6 +15770,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -12428,6 +15783,10 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX6-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -12447,18 +15806,26 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12468,7 +15835,11 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12479,6 +15850,10 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -12495,7 +15870,11 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12505,7 +15884,11 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12515,7 +15898,11 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12525,7 +15912,11 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12535,7 +15926,11 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12545,7 +15940,11 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12555,7 +15954,11 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12565,7 +15968,11 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12576,7 +15983,11 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX1250-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12594,6 +16005,10 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX6-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -12615,11 +16030,15 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -12628,7 +16047,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12641,7 +16064,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12655,6 +16082,10 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -12672,7 +16103,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12685,7 +16120,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12698,7 +16137,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12710,7 +16153,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12722,7 +16169,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12735,7 +16186,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; ; GFX11-CU-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12748,7 +16203,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12760,7 +16219,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; ; GFX12-CU-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12773,7 +16236,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX1250-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12794,6 +16261,10 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX6-LABEL: global_system_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -12814,11 +16285,15 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -12826,7 +16301,11 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: global_system_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12838,7 +16317,11 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: global_system_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12851,6 +16334,10 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -12868,7 +16355,11 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12880,7 +16371,11 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12892,7 +16387,11 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12904,7 +16403,11 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12916,7 +16419,11 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; ; GFX11-WGP-LABEL: global_system_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12928,7 +16435,11 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; ; GFX11-CU-LABEL: global_system_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12940,7 +16451,11 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; ; GFX12-WGP-LABEL: global_system_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12955,7 +16470,11 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; ; GFX12-CU-LABEL: global_system_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12971,7 +16490,11 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX1250-LABEL: global_system_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12994,6 +16517,10 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -13016,11 +16543,15 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -13030,7 +16561,11 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13045,7 +16580,11 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13061,6 +16600,10 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -13079,7 +16622,11 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13094,7 +16641,11 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13109,7 +16660,11 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13123,7 +16678,11 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13137,7 +16696,11 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13152,7 +16715,11 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13167,7 +16734,11 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13184,7 +16755,11 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13202,7 +16777,11 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX1250-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13228,6 +16807,10 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -13250,11 +16833,15 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -13264,7 +16851,11 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13279,7 +16870,11 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13295,6 +16890,10 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -13313,7 +16912,11 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13328,7 +16931,11 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13343,7 +16950,11 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13357,7 +16968,11 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13371,7 +16986,11 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13386,7 +17005,11 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13401,7 +17024,11 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13418,7 +17045,11 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13436,7 +17067,11 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX1250-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13462,6 +17097,10 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -13485,6 +17124,10 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13500,7 +17143,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13514,7 +17161,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13529,6 +17180,10 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -13547,7 +17202,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13561,7 +17220,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13575,7 +17238,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13588,7 +17255,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13601,7 +17272,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13615,7 +17290,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13629,7 +17308,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13642,7 +17325,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13656,7 +17343,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX1250-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13679,6 +17370,10 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -13703,6 +17398,10 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13719,7 +17418,11 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13735,7 +17438,11 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13752,6 +17459,10 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -13771,7 +17482,11 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13787,7 +17502,11 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13803,7 +17522,11 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13818,7 +17541,11 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13833,7 +17560,11 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13849,7 +17580,11 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13865,7 +17600,11 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13883,7 +17622,11 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13902,7 +17645,11 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX1250-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13930,6 +17677,10 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -13954,6 +17705,10 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13970,7 +17725,11 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13986,7 +17745,11 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14003,6 +17766,10 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -14022,7 +17789,11 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14038,7 +17809,11 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14054,7 +17829,11 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14069,7 +17848,11 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14084,7 +17867,11 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14100,7 +17887,11 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14116,7 +17907,11 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14134,7 +17929,11 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14153,7 +17952,11 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX1250-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14182,6 +17985,12 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14206,11 +18015,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14220,6 +18034,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -14230,7 +18045,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14244,7 +18065,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14260,6 +18087,12 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14280,7 +18113,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14294,7 +18133,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14308,7 +18153,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14322,7 +18173,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14336,7 +18193,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14350,7 +18213,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14364,7 +18233,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14378,7 +18253,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14393,7 +18274,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -14417,6 +18304,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14443,11 +18336,16 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14457,6 +18355,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -14469,7 +18368,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14486,7 +18391,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14505,6 +18416,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14526,7 +18443,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14543,7 +18466,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14560,7 +18489,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14576,7 +18511,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14592,7 +18533,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14609,7 +18556,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14626,7 +18579,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14642,7 +18601,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14659,7 +18624,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -14686,6 +18657,12 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14711,11 +18688,16 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14725,6 +18707,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -14736,7 +18719,13 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14752,7 +18741,13 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14770,6 +18765,12 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14791,7 +18792,13 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14807,7 +18814,13 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14823,7 +18836,13 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14839,7 +18858,13 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14855,7 +18880,13 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14871,7 +18902,13 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14887,7 +18924,13 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14906,7 +18949,13 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14926,7 +18975,13 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; GFX1250-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -14955,6 +19010,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14982,11 +19043,16 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14996,6 +19062,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15009,7 +19076,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15028,7 +19101,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15049,6 +19128,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15071,7 +19156,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15090,7 +19181,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15109,7 +19206,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15127,7 +19230,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15145,7 +19254,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15164,7 +19279,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15183,7 +19304,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15204,7 +19331,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15226,7 +19359,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15258,6 +19397,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15285,11 +19430,16 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15299,6 +19449,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15312,7 +19463,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15331,7 +19488,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15352,6 +19515,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15374,7 +19543,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15393,7 +19568,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15412,7 +19593,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15430,7 +19617,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15448,7 +19641,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15467,7 +19666,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15486,7 +19691,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15507,7 +19718,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15529,7 +19746,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15561,6 +19784,12 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15587,11 +19816,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15601,6 +19835,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15613,7 +19848,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15630,7 +19871,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15649,6 +19896,12 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15670,7 +19923,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15687,7 +19946,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15704,7 +19969,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15720,7 +19991,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15736,7 +20013,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15753,7 +20036,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15770,7 +20059,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15786,7 +20081,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15803,7 +20104,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15830,6 +20137,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15856,11 +20169,16 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15870,6 +20188,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15882,7 +20201,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15899,7 +20224,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15918,6 +20249,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15939,7 +20276,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15956,7 +20299,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15973,7 +20322,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15989,7 +20344,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16005,7 +20366,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16022,7 +20389,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16039,7 +20412,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16055,7 +20434,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16072,7 +20457,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX1250-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16099,6 +20490,12 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16126,11 +20523,16 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16140,6 +20542,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16153,7 +20556,13 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16172,7 +20581,13 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16193,6 +20608,12 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16215,7 +20636,13 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16234,7 +20661,13 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16253,7 +20686,13 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16271,7 +20710,13 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16289,7 +20734,13 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16308,7 +20759,13 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16327,7 +20784,13 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16348,7 +20811,13 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16370,7 +20839,13 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX1250-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16402,6 +20877,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16429,11 +20910,16 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16443,6 +20929,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16456,7 +20943,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16475,7 +20968,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16496,6 +20995,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16518,7 +21023,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16537,7 +21048,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16556,7 +21073,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16574,7 +21097,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16592,7 +21121,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16611,7 +21146,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16630,7 +21171,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16651,7 +21198,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16673,7 +21226,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16705,6 +21264,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16732,11 +21297,16 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16746,6 +21316,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16759,7 +21330,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16778,7 +21355,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16799,6 +21382,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16821,7 +21410,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16840,7 +21435,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16859,7 +21460,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16877,7 +21484,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16895,7 +21508,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16914,7 +21533,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16933,7 +21558,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16954,7 +21585,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16976,7 +21613,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17008,6 +21651,12 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17035,11 +21684,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17049,6 +21703,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17062,7 +21717,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17081,7 +21742,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17102,6 +21769,12 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17124,7 +21797,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17143,7 +21822,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17162,7 +21847,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17180,7 +21871,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17198,7 +21895,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17217,7 +21920,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17236,7 +21945,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17257,7 +21972,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17279,7 +22000,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17311,6 +22038,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17338,11 +22071,16 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17352,6 +22090,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17365,7 +22104,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17384,7 +22129,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17405,6 +22156,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17427,7 +22184,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17446,7 +22209,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17465,7 +22234,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17483,7 +22258,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17501,7 +22282,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17520,7 +22307,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17539,7 +22332,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17560,7 +22359,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17582,7 +22387,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17614,6 +22425,12 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17641,11 +22458,16 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17655,6 +22477,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17668,7 +22491,13 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17687,7 +22516,13 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17708,6 +22543,12 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17730,7 +22571,13 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17749,7 +22596,13 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17768,7 +22621,13 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17786,7 +22645,13 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17804,7 +22669,13 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17823,7 +22694,13 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17842,7 +22719,13 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17863,7 +22746,13 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17885,7 +22774,13 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX1250-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17917,6 +22812,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17944,11 +22845,16 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17958,6 +22864,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17971,7 +22878,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17990,7 +22903,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18011,6 +22930,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -18033,7 +22958,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18052,7 +22983,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18071,7 +23008,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18089,7 +23032,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18107,7 +23056,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18126,7 +23081,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18145,7 +23106,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18166,7 +23133,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18188,7 +23161,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18220,6 +23199,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -18247,11 +23232,16 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -18261,6 +23251,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -18274,7 +23265,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18293,7 +23290,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18314,6 +23317,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -18336,7 +23345,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18355,7 +23370,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18374,7 +23395,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18392,7 +23419,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18410,7 +23443,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18429,7 +23468,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18448,7 +23493,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18469,7 +23520,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18491,7 +23548,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18523,6 +23586,12 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -18551,6 +23620,12 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -18578,7 +23653,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18594,7 +23675,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18612,6 +23699,12 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -18635,7 +23728,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18651,7 +23750,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18667,7 +23772,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18683,7 +23794,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18699,7 +23816,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18715,7 +23838,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18731,7 +23860,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18747,7 +23882,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18764,7 +23905,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18792,6 +23939,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -18821,6 +23974,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -18849,7 +24008,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18867,7 +24032,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18887,6 +24058,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -18910,7 +24087,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18928,7 +24111,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18946,7 +24135,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18963,7 +24158,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18980,7 +24181,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18998,7 +24205,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19016,7 +24229,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19033,7 +24252,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19051,7 +24276,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19081,6 +24312,12 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -19110,6 +24347,12 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19138,7 +24381,13 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19156,7 +24405,13 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19176,6 +24431,12 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -19200,7 +24461,13 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19218,7 +24485,13 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19236,7 +24509,13 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19254,7 +24533,13 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19272,7 +24557,13 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19290,7 +24581,13 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19308,7 +24605,13 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19329,7 +24632,13 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19351,7 +24660,13 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19384,6 +24699,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -19414,6 +24735,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19443,7 +24770,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19463,7 +24796,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19485,6 +24824,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -19509,7 +24854,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19529,7 +24880,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19549,7 +24906,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19568,7 +24931,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19587,7 +24956,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19607,7 +24982,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19627,7 +25008,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19649,7 +25036,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19672,7 +25065,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19707,6 +25106,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -19737,6 +25142,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19766,7 +25177,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19786,7 +25203,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19808,6 +25231,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -19832,7 +25261,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19852,7 +25287,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19872,7 +25313,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19891,7 +25338,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19910,7 +25363,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19930,7 +25389,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19950,7 +25415,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19972,7 +25443,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19995,7 +25472,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20030,6 +25513,12 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -20059,6 +25548,12 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20087,7 +25582,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20105,7 +25606,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20125,6 +25632,12 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -20148,7 +25661,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20166,7 +25685,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20184,7 +25709,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20201,7 +25732,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20218,7 +25755,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20236,7 +25779,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20254,7 +25803,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20271,7 +25826,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20289,7 +25850,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20319,6 +25886,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -20348,6 +25921,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20376,7 +25955,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20394,7 +25979,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20414,6 +26005,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -20437,7 +26034,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20455,7 +26058,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20473,7 +26082,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20490,7 +26105,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20507,7 +26128,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20525,7 +26152,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20543,7 +26176,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20560,7 +26199,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20578,7 +26223,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20608,6 +26259,12 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -20638,6 +26295,12 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20667,7 +26330,13 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20687,7 +26356,13 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20709,6 +26384,12 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -20733,7 +26414,13 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20753,7 +26440,13 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20773,7 +26466,13 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20792,7 +26491,13 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20811,7 +26516,13 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20831,7 +26542,13 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20851,7 +26568,13 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20873,7 +26596,13 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20896,7 +26625,13 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20931,6 +26666,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -20961,6 +26702,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20990,7 +26737,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21010,7 +26763,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21032,6 +26791,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -21056,7 +26821,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21076,7 +26847,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21096,7 +26873,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21115,7 +26898,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21134,7 +26923,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21154,7 +26949,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21174,7 +26975,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21196,7 +27003,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21219,7 +27032,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21254,6 +27073,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -21284,6 +27109,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21313,7 +27144,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21333,7 +27170,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21355,6 +27198,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -21379,7 +27228,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21399,7 +27254,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21419,7 +27280,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21438,7 +27305,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21457,7 +27330,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21477,7 +27356,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21497,7 +27382,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21519,7 +27410,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21542,7 +27439,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21577,6 +27480,12 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -21607,6 +27516,12 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21636,7 +27551,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21656,7 +27577,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21678,6 +27605,12 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -21702,7 +27635,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21722,7 +27661,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21742,7 +27687,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21761,7 +27712,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21780,7 +27737,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21800,7 +27763,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21820,7 +27789,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21842,7 +27817,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21865,7 +27846,13 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21900,6 +27887,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -21930,6 +27923,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21959,7 +27958,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21979,7 +27984,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22001,6 +28012,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -22025,7 +28042,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22045,7 +28068,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22065,7 +28094,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22084,7 +28119,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22103,7 +28144,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22123,7 +28170,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22143,7 +28196,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22165,7 +28224,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22188,7 +28253,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -22223,6 +28294,12 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -22253,6 +28330,12 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -22282,7 +28365,13 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22302,7 +28391,13 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22324,6 +28419,12 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -22348,7 +28449,13 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22368,7 +28475,13 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22388,7 +28501,13 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22407,7 +28526,13 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22426,7 +28551,13 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22446,7 +28577,13 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22466,7 +28603,13 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22488,7 +28631,13 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22511,7 +28660,13 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -22546,6 +28701,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -22576,6 +28737,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -22605,7 +28772,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22625,7 +28798,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22647,6 +28826,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -22671,7 +28856,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22691,7 +28882,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22711,7 +28908,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22730,7 +28933,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22749,7 +28958,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22769,7 +28984,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22789,7 +29010,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22811,7 +29038,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22834,7 +29067,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -22869,6 +29108,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -22899,6 +29144,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -22928,7 +29179,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22948,7 +29205,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22970,6 +29233,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -22994,7 +29263,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23014,7 +29289,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23034,7 +29315,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -23053,7 +29340,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -23072,7 +29365,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23092,7 +29391,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23112,7 +29417,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23134,7 +29445,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23157,7 +29474,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index 528195434610..ac60870b362b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -14,6 +14,9 @@ define amdgpu_kernel void @global_volatile_load_0( ; GFX6-LABEL: global_volatile_load_0: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -41,13 +44,16 @@ define amdgpu_kernel void @global_volatile_load_0( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_store_dword v[0:1], v2 @@ -55,29 +61,38 @@ define amdgpu_kernel void @global_volatile_load_0( ; ; GFX10-WGP-LABEL: global_volatile_load_0: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_volatile_load_0: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_volatile_load_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -102,62 +117,80 @@ define amdgpu_kernel void @global_volatile_load_0( ; ; GFX11-WGP-LABEL: global_volatile_load_0: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_volatile_load_0: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_volatile_load_0: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_volatile_load_0: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_volatile_load_0: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -171,6 +204,9 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX6-LABEL: global_volatile_load_1: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -187,11 +223,11 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX6-NEXT: s_mov_b32 s9, s6 ; GFX6-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] -; GFX6-NEXT: s_mov_b32 s8, 2 -; GFX6-NEXT: v_lshlrev_b32_e64 v0, s8, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_ashrrev_i32_e64 v2, 31, v0 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_mov_b32 s8, 2 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s8 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -203,14 +239,17 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 -; GFX7-NEXT: s_mov_b32 s6, 2 -; GFX7-NEXT: v_lshlrev_b32_e64 v1, s6, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, 0 -; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX7-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 2 +; GFX7-NEXT: v_lshl_b64 v[1:2], v[0:1], s6 ; GFX7-NEXT: s_mov_b32 s6, s8 ; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_mov_b32 s8, s9 @@ -229,28 +268,60 @@ define amdgpu_kernel void @global_volatile_load_1( ; ; GFX10-WGP-LABEL: global_volatile_load_1: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_mov_b32 s8, 2 -; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v1, s8, v1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: global_load_dword v1, v1, s[6:7] glc dlc +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_nop 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: v_ashrrev_i32_e64 v3, 31, v1 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, 2 +; GFX10-WGP-NEXT: v_lshlrev_b64 v[2:3], s6, v[1:2] +; GFX10-WGP-NEXT: s_mov_b32 s7, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-WGP-NEXT: s_mov_b32 s6, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: v_add_co_u32 v1, s7, s7, v1 +; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v3, s6, s6, v2, s7 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: global_load_dword v1, v[1:2], off glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_volatile_load_1: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_mov_b32 s8, 2 -; GFX10-CU-NEXT: v_lshlrev_b32_e64 v1, s8, v1 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_load_dword v1, v1, s[6:7] glc dlc +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_nop 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: v_ashrrev_i32_e64 v3, 31, v1 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, 2 +; GFX10-CU-NEXT: v_lshlrev_b64 v[2:3], s6, v[1:2] +; GFX10-CU-NEXT: s_mov_b32 s7, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-CU-NEXT: s_mov_b32 s6, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: v_add_co_u32 v1, s7, s7, v1 +; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v3, s6, s6, v2, s7 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: global_load_dword v1, v[1:2], off glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm @@ -258,6 +329,9 @@ define amdgpu_kernel void @global_volatile_load_1( ; SKIP-CACHE-INV-LABEL: global_volatile_load_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -274,11 +348,11 @@ define amdgpu_kernel void @global_volatile_load_1( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s9, s6 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], s[8:9] -; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 2 -; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v0, s8, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, 0 +; SKIP-CACHE-INV-NEXT: v_ashrrev_i32_e64 v2, 31, v0 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 2 +; SKIP-CACHE-INV-NEXT: v_lshl_b64 v[0:1], v[0:1], s8 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -286,49 +360,96 @@ define amdgpu_kernel void @global_volatile_load_1( ; ; GFX11-WGP-LABEL: global_volatile_load_1: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_mov_b32 s4, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX11-WGP-NEXT: s_mov_b32 s4, 2 -; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v1, s4, v1 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: global_load_b32 v1, v1, s[2:3] glc dlc +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2 +; GFX11-WGP-NEXT: v_ashrrev_i32_e64 v3, 31, v1 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_mov_b32 s2, 2 +; GFX11-WGP-NEXT: v_lshlrev_b64 v[2:3], s2, v[1:2] +; GFX11-WGP-NEXT: s_mov_b32 s3, s4 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-WGP-NEXT: s_mov_b32 s2, s5 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: v_add_co_u32 v1, s3, s3, v1 +; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v3, s2, s2, v2, s3 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: global_load_b32 v1, v[1:2], off glc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_volatile_load_1: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_mov_b32 s4, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX11-CU-NEXT: s_mov_b32 s4, 2 -; GFX11-CU-NEXT: v_lshlrev_b32_e64 v1, s4, v1 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: global_load_b32 v1, v1, s[2:3] glc dlc +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2 +; GFX11-CU-NEXT: v_ashrrev_i32_e64 v3, 31, v1 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_mov_b32 s2, 2 +; GFX11-CU-NEXT: v_lshlrev_b64 v[2:3], s2, v[1:2] +; GFX11-CU-NEXT: s_mov_b32 s3, s4 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-CU-NEXT: s_mov_b32 s2, s5 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: v_add_co_u32 v1, s3, s3, v1 +; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v3, s2, s2, v2, s3 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: global_load_b32 v1, v[1:2], off glc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_volatile_load_1: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_mov_b32 s4, 0x3ff -; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX12-WGP-NEXT: s_mov_b32 s4, 2 -; GFX12-WGP-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s4, v1 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v1, s[2:3] scope:SCOPE_SYS +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2 +; GFX12-WGP-NEXT: v_ashrrev_i32_e64 v3, 31, v1 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_mov_b32 s2, 2 +; GFX12-WGP-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-WGP-NEXT: v_lshlrev_b64_e64 v[2:3], s2, v[1:2] +; GFX12-WGP-NEXT: s_mov_b32 s3, s4 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-WGP-NEXT: s_mov_b32 s2, s5 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-WGP-NEXT: v_add_co_u32 v1, s3, s3, v1 +; GFX12-WGP-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v3, s2, s2, v2, s3 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_load_b32 v1, v[1:2], off scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -337,17 +458,34 @@ define amdgpu_kernel void @global_volatile_load_1( ; ; GFX12-CU-LABEL: global_volatile_load_1: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_mov_b32 s4, 0x3ff -; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX12-CU-NEXT: s_mov_b32 s4, 2 -; GFX12-CU-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s4, v1 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v1, s[2:3] scope:SCOPE_SYS +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2 +; GFX12-CU-NEXT: v_ashrrev_i32_e64 v3, 31, v1 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_mov_b32 s2, 2 +; GFX12-CU-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-CU-NEXT: v_lshlrev_b64_e64 v[2:3], s2, v[1:2] +; GFX12-CU-NEXT: s_mov_b32 s3, s4 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-CU-NEXT: s_mov_b32 s2, s5 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-CU-NEXT: v_add_co_u32 v1, s3, s3, v1 +; GFX12-CU-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v3, s2, s2, v2, s3 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_load_b32 v1, v[1:2], off scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -358,16 +496,20 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b32 s4, 0x3ff ; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -383,6 +525,9 @@ define amdgpu_kernel void @global_volatile_store_0( ; GFX6-LABEL: global_volatile_store_0: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -407,6 +552,10 @@ define amdgpu_kernel void @global_volatile_store_0( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 @@ -420,10 +569,13 @@ define amdgpu_kernel void @global_volatile_store_0( ; ; GFX10-WGP-LABEL: global_volatile_store_0: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -433,10 +585,13 @@ define amdgpu_kernel void @global_volatile_store_0( ; ; GFX10-CU-LABEL: global_volatile_store_0: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -447,6 +602,9 @@ define amdgpu_kernel void @global_volatile_store_0( ; SKIP-CACHE-INV-LABEL: global_volatile_store_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -467,10 +625,13 @@ define amdgpu_kernel void @global_volatile_store_0( ; ; GFX11-WGP-LABEL: global_volatile_store_0: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -480,10 +641,13 @@ define amdgpu_kernel void @global_volatile_store_0( ; ; GFX11-CU-LABEL: global_volatile_store_0: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -493,10 +657,13 @@ define amdgpu_kernel void @global_volatile_store_0( ; ; GFX12-WGP-LABEL: global_volatile_store_0: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -511,10 +678,13 @@ define amdgpu_kernel void @global_volatile_store_0( ; ; GFX12-CU-LABEL: global_volatile_store_0: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -530,10 +700,13 @@ define amdgpu_kernel void @global_volatile_store_0( ; GFX1250-LABEL: global_volatile_store_0: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 @@ -551,9 +724,12 @@ entry: define amdgpu_kernel void @global_volatile_store_1( ; GFX6-LABEL: global_volatile_store_1: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -561,11 +737,11 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX6-NEXT: s_mov_b32 s7, s2 ; GFX6-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX6-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_mov_b32 s5, 2 -; GFX6-NEXT: v_lshlrev_b32_e64 v1, s5, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, 0 -; GFX6-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_lshl_b64 v[1:2], v[0:1], s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 @@ -578,14 +754,18 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX7-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_mov_b32 s5, 2 -; GFX7-NEXT: v_lshlrev_b32_e64 v1, s5, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, 0 -; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_lshl_b64 v[1:2], v[0:1], s5 ; GFX7-NEXT: s_mov_b32 s6, s8 ; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_mov_b32 s5, s9 @@ -603,37 +783,72 @@ define amdgpu_kernel void @global_volatile_store_1( ; ; GFX10-WGP-LABEL: global_volatile_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_mov_b32 s7, 2 -; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v0, s7, v0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_nop 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-WGP-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-WGP-NEXT: s_mov_b32 s5, 2 +; GFX10-WGP-NEXT: v_lshlrev_b64 v[1:2], s5, v[0:1] +; GFX10-WGP-NEXT: s_mov_b32 s6, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-WGP-NEXT: v_add_co_u32 v0, s6, s6, v0 +; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v2, s5, s5, v1, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_volatile_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX10-CU-NEXT: s_mov_b32 s7, 2 -; GFX10-CU-NEXT: v_lshlrev_b32_e64 v0, s7, v0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_nop 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-CU-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-CU-NEXT: s_mov_b32 s5, 2 +; GFX10-CU-NEXT: v_lshlrev_b64 v[1:2], s5, v[0:1] +; GFX10-CU-NEXT: s_mov_b32 s6, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-CU-NEXT: v_add_co_u32 v0, s6, s6, v0 +; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v2, s5, s5, v1, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_volatile_store_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0 @@ -641,11 +856,11 @@ define amdgpu_kernel void @global_volatile_store_1( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s2 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[6:7] +; SKIP-CACHE-INV-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 2 -; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v1, s5, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 +; SKIP-CACHE-INV-NEXT: v_lshl_b64 v[1:2], v[0:1], s5 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 @@ -654,86 +869,152 @@ define amdgpu_kernel void @global_volatile_store_1( ; ; GFX11-WGP-LABEL: global_volatile_store_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s3 -; GFX11-WGP-NEXT: s_mov_b32 s3, 2 -; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v0, s3, v0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[4:5], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1 +; GFX11-WGP-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-WGP-NEXT: s_mov_b32 s1, 2 +; GFX11-WGP-NEXT: v_lshlrev_b64 v[1:2], s1, v[0:1] +; GFX11-WGP-NEXT: s_mov_b32 s2, s4 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-WGP-NEXT: s_mov_b32 s1, s5 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-WGP-NEXT: v_add_co_u32 v0, s2, s2, v0 +; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v2, s1, s1, v1, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_volatile_store_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s3 -; GFX11-CU-NEXT: s_mov_b32 s3, 2 -; GFX11-CU-NEXT: v_lshlrev_b32_e64 v0, s3, v0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[4:5], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1 +; GFX11-CU-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-CU-NEXT: s_mov_b32 s1, 2 +; GFX11-CU-NEXT: v_lshlrev_b64 v[1:2], s1, v[0:1] +; GFX11-CU-NEXT: s_mov_b32 s2, s4 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-CU-NEXT: s_mov_b32 s1, s5 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-CU-NEXT: v_add_co_u32 v0, s2, s2, v0 +; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v2, s1, s1, v1, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_volatile_store_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff -; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s3 -; GFX12-WGP-NEXT: s_mov_b32 s3, 2 -; GFX12-WGP-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v0, s3, v0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff +; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1 +; GFX12-WGP-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-WGP-NEXT: s_mov_b32 s1, 2 +; GFX12-WGP-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-WGP-NEXT: v_lshlrev_b64_e64 v[1:2], s1, v[0:1] +; GFX12-WGP-NEXT: s_mov_b32 s2, s4 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-WGP-NEXT: s_mov_b32 s1, s5 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-WGP-NEXT: v_add_co_u32 v0, s2, s2, v0 +; GFX12-WGP-NEXT: s_wait_alu depctr_sa_sdst(0) depctr_va_sdst(0) +; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v2, s1, s1, v1, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX12-WGP-NEXT: global_store_b32 v[0:1], v2, off scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_volatile_store_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff -; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s3 -; GFX12-CU-NEXT: s_mov_b32 s3, 2 -; GFX12-CU-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-CU-NEXT: v_lshlrev_b32_e64 v0, s3, v0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[4:5], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff +; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1 +; GFX12-CU-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-CU-NEXT: s_mov_b32 s1, 2 +; GFX12-CU-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-CU-NEXT: v_lshlrev_b64_e64 v[1:2], s1, v[0:1] +; GFX12-CU-NEXT: s_mov_b32 s2, s4 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-CU-NEXT: s_mov_b32 s1, s5 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-CU-NEXT: v_add_co_u32 v0, s2, s2, v0 +; GFX12-CU-NEXT: s_wait_alu depctr_sa_sdst(0) depctr_va_sdst(0) +; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v2, s1, s1, v1, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX12-CU-NEXT: global_store_b32 v[0:1], v2, off scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_volatile_store_1: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv +; GFX1250-NEXT: s_nop 0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b32 s3, 0x3ff @@ -757,6 +1038,9 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; GFX6-LABEL: global_volatile_workgroup_acquire_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -784,12 +1068,16 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -798,30 +1086,40 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; ; GFX10-WGP-LABEL: global_volatile_workgroup_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_volatile_workgroup_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_volatile_workgroup_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -846,59 +1144,78 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; ; GFX11-WGP-LABEL: global_volatile_workgroup_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_volatile_workgroup_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_volatile_workgroup_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_volatile_workgroup_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_volatile_workgroup_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -912,6 +1229,9 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX6-LABEL: global_volatile_workgroup_release_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -934,6 +1254,10 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -945,10 +1269,13 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; ; GFX10-WGP-LABEL: global_volatile_workgroup_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -957,10 +1284,13 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; ; GFX10-CU-LABEL: global_volatile_workgroup_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -970,6 +1300,9 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; SKIP-CACHE-INV-LABEL: global_volatile_workgroup_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -988,10 +1321,13 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; ; GFX11-WGP-LABEL: global_volatile_workgroup_release_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1000,10 +1336,13 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; ; GFX11-CU-LABEL: global_volatile_workgroup_release_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1012,43 +1351,55 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; ; GFX12-WGP-LABEL: global_volatile_workgroup_release_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_volatile_workgroup_release_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_volatile_workgroup_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll index 0e22aadb6add..1d28698bfa28 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -18,6 +18,9 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; GFX6-LABEL: global_wavefront_unordered_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -45,12 +48,16 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -59,29 +66,38 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; ; GFX10-WGP-LABEL: global_wavefront_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -106,101 +122,131 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_wavefront_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_wavefront_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -214,6 +260,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; GFX6-LABEL: global_wavefront_monotonic_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -241,12 +290,16 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -255,29 +308,38 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -302,101 +364,131 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_wavefront_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_wavefront_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -410,6 +502,9 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; GFX6-LABEL: global_wavefront_acquire_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -437,12 +532,16 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -451,29 +550,38 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; ; GFX10-WGP-LABEL: global_wavefront_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -498,101 +606,131 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_wavefront_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_wavefront_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -606,6 +744,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; GFX6-LABEL: global_wavefront_seq_cst_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -633,12 +774,16 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -647,29 +792,38 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -694,101 +848,131 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_wavefront_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_wavefront_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -802,6 +986,9 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; GFX6-LABEL: global_wavefront_unordered_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -823,6 +1010,10 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -833,27 +1024,38 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; ; GFX10-WGP-LABEL: global_wavefront_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -871,93 +1073,129 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_wavefront_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_wavefront_unordered_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -970,6 +1208,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; GFX6-LABEL: global_wavefront_monotonic_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -991,6 +1232,10 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1001,27 +1246,38 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1039,93 +1295,129 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_wavefront_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_wavefront_monotonic_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1138,6 +1430,9 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX6-LABEL: global_wavefront_release_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1159,6 +1454,10 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1169,27 +1468,38 @@ define amdgpu_kernel void @global_wavefront_release_store( ; ; GFX10-WGP-LABEL: global_wavefront_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1207,93 +1517,129 @@ define amdgpu_kernel void @global_wavefront_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_release_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_release_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_release_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_wavefront_release_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_wavefront_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1306,6 +1652,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX6-LABEL: global_wavefront_seq_cst_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1327,6 +1676,10 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1337,27 +1690,38 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1375,93 +1739,129 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_wavefront_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_wavefront_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1474,6 +1874,10 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; GFX6-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -1493,18 +1897,26 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1514,7 +1926,11 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1525,6 +1941,10 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -1541,7 +1961,11 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1551,7 +1975,11 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1561,7 +1989,11 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1571,7 +2003,11 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1581,7 +2017,11 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1591,7 +2031,11 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1601,7 +2045,11 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1611,7 +2059,11 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1622,7 +2074,11 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; GFX1250-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1640,6 +2096,10 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX6-LABEL: global_wavefront_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -1659,18 +2119,26 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1680,7 +2148,11 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; ; GFX10-CU-LABEL: global_wavefront_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1691,6 +2163,10 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -1707,7 +2183,11 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1717,7 +2197,11 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1727,7 +2211,11 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1737,7 +2225,11 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1747,7 +2239,11 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: global_wavefront_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1757,7 +2253,11 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; ; GFX11-CU-LABEL: global_wavefront_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1767,7 +2267,11 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: global_wavefront_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1777,7 +2281,11 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; ; GFX12-CU-LABEL: global_wavefront_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1788,7 +2296,11 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX1250-LABEL: global_wavefront_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1806,6 +2318,10 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX6-LABEL: global_wavefront_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -1825,18 +2341,26 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1846,7 +2370,11 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; ; GFX10-CU-LABEL: global_wavefront_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1857,6 +2385,10 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_wavefront_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -1873,7 +2405,11 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1883,7 +2419,11 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1893,7 +2433,11 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1903,7 +2447,11 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1913,7 +2461,11 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; ; GFX11-WGP-LABEL: global_wavefront_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1923,7 +2475,11 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; ; GFX11-CU-LABEL: global_wavefront_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1933,7 +2489,11 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; ; GFX12-WGP-LABEL: global_wavefront_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1943,7 +2503,11 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; ; GFX12-CU-LABEL: global_wavefront_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1954,7 +2518,11 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX1250-LABEL: global_wavefront_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1972,6 +2540,10 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX6-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -1991,18 +2563,26 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2012,7 +2592,11 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2023,6 +2607,10 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2039,7 +2627,11 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2049,7 +2641,11 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2059,7 +2655,11 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2069,7 +2669,11 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2079,7 +2683,11 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2089,7 +2697,11 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2099,7 +2711,11 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2109,7 +2725,11 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2120,7 +2740,11 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX1250-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2138,6 +2762,10 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX6-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2157,18 +2785,26 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2178,7 +2814,11 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2189,6 +2829,10 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2205,7 +2849,11 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2215,7 +2863,11 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2225,7 +2877,11 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2235,7 +2891,11 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2245,7 +2905,11 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2255,7 +2919,11 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2265,7 +2933,11 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2275,7 +2947,11 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2286,7 +2962,11 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX1250-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2304,6 +2984,10 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; GFX6-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2326,6 +3010,10 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2340,7 +3028,11 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2352,7 +3044,11 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2365,6 +3061,10 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2383,7 +3083,11 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2395,7 +3099,11 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2407,7 +3115,11 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2419,7 +3131,11 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2431,7 +3147,11 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2443,7 +3163,11 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2455,7 +3179,11 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2467,7 +3195,11 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2480,7 +3212,11 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; GFX1250-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2501,6 +3237,10 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2523,6 +3263,10 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2537,7 +3281,11 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2549,7 +3297,11 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2562,6 +3314,10 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2580,7 +3336,11 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2592,7 +3352,11 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2604,7 +3368,11 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2616,7 +3384,11 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2628,7 +3400,11 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2640,7 +3416,11 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2652,7 +3432,11 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2664,7 +3448,11 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2677,7 +3465,11 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX1250-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2698,6 +3490,10 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2720,6 +3516,10 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2734,7 +3534,11 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2746,7 +3550,11 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2759,6 +3567,10 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2777,7 +3589,11 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2789,7 +3605,11 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2801,7 +3621,11 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2813,7 +3637,11 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2825,7 +3653,11 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2837,7 +3669,11 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2849,7 +3685,11 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2861,7 +3701,11 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2874,7 +3718,11 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX1250-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2896,6 +3744,12 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2920,11 +3774,16 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -2934,6 +3793,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -2944,7 +3804,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2958,7 +3824,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2974,6 +3846,12 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2994,7 +3872,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3008,7 +3892,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3022,7 +3912,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3036,7 +3932,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3050,7 +3952,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3064,7 +3972,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3078,7 +3992,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3092,7 +4012,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3107,7 +4033,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3131,6 +4063,12 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3155,11 +4093,16 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3169,6 +4112,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3179,7 +4123,13 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3193,7 +4143,13 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3209,6 +4165,12 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3229,7 +4191,13 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3243,7 +4211,13 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3257,7 +4231,13 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3271,7 +4251,13 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3285,7 +4271,13 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3299,7 +4291,13 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3313,7 +4311,13 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3327,7 +4331,13 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3342,7 +4352,13 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3366,6 +4382,12 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3390,11 +4412,16 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3404,6 +4431,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3414,7 +4442,13 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3428,7 +4462,13 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3444,6 +4484,12 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3464,7 +4510,13 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3478,7 +4530,13 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3492,7 +4550,13 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3506,7 +4570,13 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3520,7 +4590,13 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3534,7 +4610,13 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3548,7 +4630,13 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3562,7 +4650,13 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3577,7 +4671,13 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX1250-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3601,6 +4701,12 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3625,11 +4731,16 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3639,6 +4750,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3649,7 +4761,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3663,7 +4781,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3679,6 +4803,12 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3699,7 +4829,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3713,7 +4849,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3727,7 +4869,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3741,7 +4889,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3755,7 +4909,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3769,7 +4929,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3783,7 +4949,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3797,7 +4969,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3812,7 +4990,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3836,6 +5020,12 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3860,11 +5050,16 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3874,6 +5069,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3884,7 +5080,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3898,7 +5100,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3914,6 +5122,12 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3934,7 +5148,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3948,7 +5168,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3962,7 +5188,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3976,7 +5208,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3990,7 +5228,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4004,7 +5248,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4018,7 +5268,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4032,7 +5288,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4047,7 +5309,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4071,6 +5339,12 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4095,11 +5369,16 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4109,6 +5388,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4119,7 +5399,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4133,7 +5419,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4149,6 +5441,12 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4169,7 +5467,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4183,7 +5487,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4197,7 +5507,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4211,7 +5527,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4225,7 +5547,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4239,7 +5567,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4253,7 +5587,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4267,7 +5607,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4282,7 +5628,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4306,6 +5658,12 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4330,11 +5688,16 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4344,6 +5707,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4354,7 +5718,13 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4368,7 +5738,13 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4384,6 +5760,12 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4404,7 +5786,13 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4418,7 +5806,13 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4432,7 +5826,13 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4446,7 +5846,13 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4460,7 +5866,13 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4474,7 +5886,13 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4488,7 +5906,13 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4502,7 +5926,13 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4517,7 +5947,13 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX1250-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4541,6 +5977,12 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4565,11 +6007,16 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4579,6 +6026,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4589,7 +6037,13 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4603,7 +6057,13 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4619,6 +6079,12 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4639,7 +6105,13 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4653,7 +6125,13 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4667,7 +6145,13 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4681,7 +6165,13 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4695,7 +6185,13 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4709,7 +6205,13 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4723,7 +6225,13 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4737,7 +6245,13 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4752,7 +6266,13 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX1250-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4776,6 +6296,12 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4800,11 +6326,16 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4814,6 +6345,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4824,7 +6356,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4838,7 +6376,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4854,6 +6398,12 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4874,7 +6424,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4888,7 +6444,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4902,7 +6464,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4916,7 +6484,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4930,7 +6504,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4944,7 +6524,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4958,7 +6544,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4972,7 +6564,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4987,7 +6585,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5011,6 +6615,12 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5035,11 +6645,16 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5049,6 +6664,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5059,7 +6675,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5073,7 +6695,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5089,6 +6717,12 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5109,7 +6743,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5123,7 +6763,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5137,7 +6783,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5151,7 +6803,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5165,7 +6823,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5179,7 +6843,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5193,7 +6863,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5207,7 +6883,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5222,7 +6904,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5246,6 +6934,12 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5270,11 +6964,16 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5284,6 +6983,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5294,7 +6994,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5308,7 +7014,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5324,6 +7036,12 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5344,7 +7062,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5358,7 +7082,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5372,7 +7102,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5386,7 +7122,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5400,7 +7142,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5414,7 +7162,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5428,7 +7182,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5442,7 +7202,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5457,7 +7223,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5481,6 +7253,12 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5505,11 +7283,16 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5519,6 +7302,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5529,7 +7313,13 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5543,7 +7333,13 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5559,6 +7355,12 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5579,7 +7381,13 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5593,7 +7401,13 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5607,7 +7421,13 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5621,7 +7441,13 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5635,7 +7461,13 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5649,7 +7481,13 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5663,7 +7501,13 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5677,7 +7521,13 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5692,7 +7542,13 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5716,6 +7572,12 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5740,11 +7602,16 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5754,6 +7621,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5764,7 +7632,13 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5778,7 +7652,13 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5794,6 +7674,12 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5814,7 +7700,13 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5828,7 +7720,13 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5842,7 +7740,13 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5856,7 +7760,13 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5870,7 +7780,13 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5884,7 +7800,13 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5898,7 +7820,13 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5912,7 +7840,13 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5927,7 +7861,13 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX1250-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5951,6 +7891,12 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5975,11 +7921,16 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5989,6 +7940,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5999,7 +7951,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6013,7 +7971,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6029,6 +7993,12 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6049,7 +8019,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6063,7 +8039,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6077,7 +8059,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6091,7 +8079,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6105,7 +8099,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6119,7 +8119,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6133,7 +8139,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6147,7 +8159,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6162,7 +8180,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6186,6 +8210,12 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6210,11 +8240,16 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6224,6 +8259,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6234,7 +8270,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6248,7 +8290,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6264,6 +8312,12 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6284,7 +8338,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6298,7 +8358,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6312,7 +8378,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6326,7 +8398,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6340,7 +8418,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6354,7 +8438,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6368,7 +8458,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6382,7 +8478,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6397,7 +8499,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6421,6 +8529,12 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6449,6 +8563,12 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -6476,7 +8596,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6492,7 +8618,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6510,6 +8642,12 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6533,7 +8671,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6549,7 +8693,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6565,7 +8715,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6581,7 +8737,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6597,7 +8759,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6613,7 +8781,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6629,7 +8803,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6645,7 +8825,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6662,7 +8848,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6690,6 +8882,12 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6718,6 +8916,12 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -6745,7 +8949,13 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6761,7 +8971,13 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6779,6 +8995,12 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6802,7 +9024,13 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6818,7 +9046,13 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6834,7 +9068,13 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6850,7 +9090,13 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6866,7 +9112,13 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6882,7 +9134,13 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6898,7 +9156,13 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6914,7 +9178,13 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6931,7 +9201,13 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6959,6 +9235,12 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6987,6 +9269,12 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7014,7 +9302,13 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7030,7 +9324,13 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7048,6 +9348,12 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7071,7 +9377,13 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7087,7 +9399,13 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7103,7 +9421,13 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7119,7 +9443,13 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7135,7 +9465,13 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7151,7 +9487,13 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7167,7 +9509,13 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7183,7 +9531,13 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7200,7 +9554,13 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7228,6 +9588,12 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7256,6 +9622,12 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7283,7 +9655,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7299,7 +9677,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7317,6 +9701,12 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7340,7 +9730,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7356,7 +9752,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7372,7 +9774,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7388,7 +9796,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7404,7 +9818,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7420,7 +9840,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7436,7 +9862,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7452,7 +9884,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7469,7 +9907,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7497,6 +9941,12 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7525,6 +9975,12 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7552,7 +10008,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7568,7 +10030,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7586,6 +10054,12 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7609,7 +10083,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7625,7 +10105,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7641,7 +10127,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7657,7 +10149,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7673,7 +10171,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7689,7 +10193,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7705,7 +10215,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7721,7 +10237,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7738,7 +10260,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7766,6 +10294,12 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7794,6 +10328,12 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7821,7 +10361,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7837,7 +10383,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7855,6 +10407,12 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7878,7 +10436,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7894,7 +10458,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7910,7 +10480,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7926,7 +10502,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7942,7 +10524,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7958,7 +10546,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7974,7 +10568,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7990,7 +10590,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8007,7 +10613,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8035,6 +10647,12 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8063,6 +10681,12 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8090,7 +10714,13 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8106,7 +10736,13 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8124,6 +10760,12 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8147,7 +10789,13 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8163,7 +10811,13 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8179,7 +10833,13 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8195,7 +10855,13 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8211,7 +10877,13 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8227,7 +10899,13 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8243,7 +10921,13 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8259,7 +10943,13 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8276,7 +10966,13 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8304,6 +11000,12 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8332,6 +11034,12 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8359,7 +11067,13 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8375,7 +11089,13 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8393,6 +11113,12 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8416,7 +11142,13 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8432,7 +11164,13 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8448,7 +11186,13 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8464,7 +11208,13 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8480,7 +11230,13 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8496,7 +11252,13 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8512,7 +11274,13 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8528,7 +11296,13 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8545,7 +11319,13 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8573,6 +11353,12 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8601,6 +11387,12 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8628,7 +11420,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8644,7 +11442,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8662,6 +11466,12 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8685,7 +11495,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8701,7 +11517,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8717,7 +11539,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8733,7 +11561,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8749,7 +11583,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8765,7 +11605,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8781,7 +11627,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8797,7 +11649,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8814,7 +11672,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8842,6 +11706,12 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8870,6 +11740,12 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8897,7 +11773,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8913,7 +11795,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8931,6 +11819,12 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8954,7 +11848,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8970,7 +11870,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8986,7 +11892,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9002,7 +11914,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9018,7 +11936,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9034,7 +11958,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9050,7 +11980,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9066,7 +12002,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9083,7 +12025,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9111,6 +12059,12 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9139,6 +12093,12 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9166,7 +12126,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9182,7 +12148,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9200,6 +12172,12 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9223,7 +12201,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9239,7 +12223,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9255,7 +12245,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9271,7 +12267,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9287,7 +12289,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9303,7 +12311,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9319,7 +12333,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9335,7 +12355,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9352,7 +12378,13 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9380,6 +12412,12 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9408,6 +12446,12 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9435,7 +12479,13 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9451,7 +12501,13 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9469,6 +12525,12 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9492,7 +12554,13 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9508,7 +12576,13 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9524,7 +12598,13 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9540,7 +12620,13 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9556,7 +12642,13 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9572,7 +12664,13 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9588,7 +12686,13 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9604,7 +12708,13 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9621,7 +12731,13 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9649,6 +12765,12 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9677,6 +12799,12 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9704,7 +12832,13 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9720,7 +12854,13 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9738,6 +12878,12 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9761,7 +12907,13 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9777,7 +12929,13 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9793,7 +12951,13 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9809,7 +12973,13 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9825,7 +12995,13 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9841,7 +13017,13 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9857,7 +13039,13 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9873,7 +13061,13 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9890,7 +13084,13 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9918,6 +13118,12 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9946,6 +13152,12 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9973,7 +13185,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9989,7 +13207,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10007,6 +13231,12 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10030,7 +13260,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10046,7 +13282,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10062,7 +13304,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10078,7 +13326,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10094,7 +13348,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10110,7 +13370,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10126,7 +13392,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10142,7 +13414,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10159,7 +13437,13 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10187,6 +13471,12 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10215,6 +13505,12 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10242,7 +13538,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10258,7 +13560,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10276,6 +13584,12 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10299,7 +13613,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10315,7 +13635,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10331,7 +13657,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10347,7 +13679,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10363,7 +13701,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10379,7 +13723,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10395,7 +13745,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10411,7 +13767,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10428,7 +13790,13 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10455,6 +13823,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; GFX6-LABEL: global_wavefront_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10482,12 +13853,16 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -10496,29 +13871,38 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10543,101 +13927,131 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_wavefront_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_wavefront_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -10651,6 +14065,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; GFX6-LABEL: global_wavefront_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10678,12 +14095,16 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -10692,29 +14113,38 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10739,101 +14169,131 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_wavefront_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -10847,6 +14307,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; GFX6-LABEL: global_wavefront_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10874,12 +14337,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -10888,29 +14355,38 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10935,101 +14411,131 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_wavefront_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -11043,6 +14549,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11070,12 +14579,16 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -11084,29 +14597,38 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11131,101 +14653,131 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -11239,6 +14791,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; GFX6-LABEL: global_wavefront_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11260,6 +14815,10 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -11270,27 +14829,38 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11308,93 +14878,129 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_wavefront_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_wavefront_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -11407,6 +15013,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; GFX6-LABEL: global_wavefront_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11428,6 +15037,10 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -11438,27 +15051,38 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11476,93 +15100,129 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_wavefront_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -11575,6 +15235,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX6-LABEL: global_wavefront_one_as_release_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11596,6 +15259,10 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -11606,27 +15273,38 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11644,93 +15322,129 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_wavefront_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_wavefront_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -11743,6 +15457,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11764,6 +15481,10 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -11774,27 +15495,38 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11812,93 +15544,129 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -11911,6 +15679,10 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; GFX6-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -11930,18 +15702,26 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11951,7 +15731,11 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11962,6 +15746,10 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -11978,7 +15766,11 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11988,7 +15780,11 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11998,7 +15794,11 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12008,7 +15808,11 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12018,7 +15822,11 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12028,7 +15836,11 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12038,7 +15850,11 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12048,7 +15864,11 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12059,7 +15879,11 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; GFX1250-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12077,6 +15901,10 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX6-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -12096,18 +15924,26 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12117,7 +15953,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12128,6 +15968,10 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -12144,7 +15988,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12154,7 +16002,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12164,7 +16016,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12174,7 +16030,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12184,7 +16044,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12194,7 +16058,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12204,7 +16072,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12214,7 +16086,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12225,7 +16101,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX1250-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12243,6 +16123,10 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX6-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -12262,18 +16146,26 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12283,7 +16175,11 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12294,6 +16190,10 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -12310,7 +16210,11 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12320,7 +16224,11 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12330,7 +16238,11 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12340,7 +16252,11 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12350,7 +16266,11 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12360,7 +16280,11 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12370,7 +16294,11 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12380,7 +16308,11 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; ; GFX12-CU-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12391,7 +16323,11 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX1250-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12409,6 +16345,10 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -12428,18 +16368,26 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12449,7 +16397,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12460,6 +16412,10 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -12476,7 +16432,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12486,7 +16446,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12496,7 +16460,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12506,7 +16474,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12516,7 +16488,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12526,7 +16502,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12536,7 +16516,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12546,7 +16530,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12557,7 +16545,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX1250-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12575,6 +16567,10 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -12594,18 +16590,26 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12615,7 +16619,11 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12626,6 +16634,10 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -12642,7 +16654,11 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12652,7 +16668,11 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12662,7 +16682,11 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12672,7 +16696,11 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12682,7 +16710,11 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12692,7 +16724,11 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12702,7 +16738,11 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12712,7 +16752,11 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12723,7 +16767,11 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12741,6 +16789,10 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -12763,6 +16815,10 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -12777,7 +16833,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12789,7 +16849,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12802,6 +16866,10 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -12820,7 +16888,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12832,7 +16904,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12844,7 +16920,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12856,7 +16936,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12868,7 +16952,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12880,7 +16968,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12892,7 +16984,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12904,7 +17000,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12917,7 +17017,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; GFX1250-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12938,6 +17042,10 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -12960,6 +17068,10 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -12974,7 +17086,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12986,7 +17102,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12999,6 +17119,10 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -13017,7 +17141,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13029,7 +17157,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13041,7 +17173,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13053,7 +17189,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13065,7 +17205,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13077,7 +17221,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13089,7 +17237,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13101,7 +17253,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13114,7 +17270,11 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX1250-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13135,6 +17295,10 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -13157,6 +17321,10 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13171,7 +17339,11 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13183,7 +17355,11 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13196,6 +17372,10 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -13214,7 +17394,11 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13226,7 +17410,11 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13238,7 +17426,11 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13250,7 +17442,11 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13262,7 +17458,11 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13274,7 +17474,11 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13286,7 +17490,11 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13298,7 +17506,11 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13311,7 +17523,11 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13333,6 +17549,12 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13357,11 +17579,16 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -13371,6 +17598,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -13381,7 +17609,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13395,7 +17629,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13411,6 +17651,12 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13431,7 +17677,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13445,7 +17697,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13459,7 +17717,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13473,7 +17737,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13487,7 +17757,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13501,7 +17777,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13515,7 +17797,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13529,7 +17817,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13544,7 +17838,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -13568,6 +17868,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13592,11 +17898,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -13606,6 +17917,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -13616,7 +17928,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13630,7 +17948,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13646,6 +17970,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13666,7 +17996,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13680,7 +18016,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13694,7 +18036,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13708,7 +18056,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13722,7 +18076,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13736,7 +18096,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13750,7 +18116,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13764,7 +18136,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13779,7 +18157,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -13803,6 +18187,12 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13827,11 +18217,16 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -13841,6 +18236,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -13851,7 +18247,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13865,7 +18267,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13881,6 +18289,12 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13901,7 +18315,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13915,7 +18335,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13929,7 +18355,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13943,7 +18375,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13957,7 +18395,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13971,7 +18415,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13985,7 +18435,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13999,7 +18455,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14014,7 +18476,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX1250-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -14038,6 +18506,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14062,11 +18536,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14076,6 +18555,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -14086,7 +18566,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14100,7 +18586,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14116,6 +18608,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14136,7 +18634,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14150,7 +18654,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14164,7 +18674,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14178,7 +18694,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14192,7 +18714,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14206,7 +18734,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14220,7 +18754,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14234,7 +18774,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14249,7 +18795,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -14273,6 +18825,12 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14297,11 +18855,16 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14311,6 +18874,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -14321,7 +18885,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14335,7 +18905,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14351,6 +18927,12 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14371,7 +18953,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14385,7 +18973,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14399,7 +18993,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14413,7 +19013,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14427,7 +19033,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14441,7 +19053,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14455,7 +19073,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14469,7 +19093,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14484,7 +19114,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -14508,6 +19144,12 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14532,11 +19174,16 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14546,6 +19193,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -14556,7 +19204,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14570,7 +19224,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14586,6 +19246,12 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14606,7 +19272,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14620,7 +19292,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14634,7 +19312,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14648,7 +19332,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14662,7 +19352,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14676,7 +19372,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14690,7 +19392,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14704,7 +19412,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14719,7 +19433,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -14743,6 +19463,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14767,11 +19493,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14781,6 +19512,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -14791,7 +19523,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14805,7 +19543,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14821,6 +19565,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14841,7 +19591,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14855,7 +19611,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14869,7 +19631,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14883,7 +19651,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14897,7 +19671,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14911,7 +19691,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14925,7 +19711,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14939,7 +19731,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14954,7 +19752,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX1250-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -14978,6 +19782,12 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15002,11 +19812,16 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15016,6 +19831,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15026,7 +19842,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15040,7 +19862,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15056,6 +19884,12 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15076,7 +19910,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15090,7 +19930,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15104,7 +19950,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15118,7 +19970,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15132,7 +19990,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15146,7 +20010,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15160,7 +20030,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15174,7 +20050,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15189,7 +20071,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX1250-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15213,6 +20101,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15237,11 +20131,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15251,6 +20150,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15261,7 +20161,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15275,7 +20181,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15291,6 +20203,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15311,7 +20229,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15325,7 +20249,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15339,7 +20269,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15353,7 +20289,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15367,7 +20309,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15381,7 +20329,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15395,7 +20349,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15409,7 +20369,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15424,7 +20390,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15448,6 +20420,12 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15472,11 +20450,16 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15486,6 +20469,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15496,7 +20480,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15510,7 +20500,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15526,6 +20522,12 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15546,7 +20548,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15560,7 +20568,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15574,7 +20588,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15588,7 +20608,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15602,7 +20628,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15616,7 +20648,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15630,7 +20668,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15644,7 +20688,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15659,7 +20709,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15683,6 +20739,12 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15707,11 +20769,16 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15721,6 +20788,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15731,7 +20799,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15745,7 +20819,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15761,6 +20841,12 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15781,7 +20867,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15795,7 +20887,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15809,7 +20907,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15823,7 +20927,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15837,7 +20947,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15851,7 +20967,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15865,7 +20987,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15879,7 +21007,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15894,7 +21028,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15918,6 +21058,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15942,11 +21088,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15956,6 +21107,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15966,7 +21118,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15980,7 +21138,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15996,6 +21160,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16016,7 +21186,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16030,7 +21206,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16044,7 +21226,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16058,7 +21246,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16072,7 +21266,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16086,7 +21286,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16100,7 +21306,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16114,7 +21326,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16129,7 +21347,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16153,6 +21377,12 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16177,11 +21407,16 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16191,6 +21426,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16201,7 +21437,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16215,7 +21457,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16231,6 +21479,12 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16251,7 +21505,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16265,7 +21525,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16279,7 +21545,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16293,7 +21565,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16307,7 +21585,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16321,7 +21605,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16335,7 +21625,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16349,7 +21645,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16364,7 +21666,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX1250-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16388,6 +21696,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16412,11 +21726,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16426,6 +21745,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16436,7 +21756,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16450,7 +21776,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16466,6 +21798,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16486,7 +21824,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16500,7 +21844,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16514,7 +21864,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16528,7 +21884,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16542,7 +21904,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16556,7 +21924,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16570,7 +21944,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16584,7 +21964,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16599,7 +21985,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16623,6 +22015,12 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16647,11 +22045,16 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16661,6 +22064,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16671,7 +22075,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16685,7 +22095,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16701,6 +22117,12 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16721,7 +22143,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16735,7 +22163,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16749,7 +22183,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16763,7 +22203,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16777,7 +22223,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16791,7 +22243,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16805,7 +22263,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16819,7 +22283,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16834,7 +22304,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16858,6 +22334,12 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16886,6 +22368,12 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -16913,7 +22401,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16929,7 +22423,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16947,6 +22447,12 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16970,7 +22476,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16986,7 +22498,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17002,7 +22520,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17018,7 +22542,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17034,7 +22564,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17050,7 +22586,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17066,7 +22608,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17082,7 +22630,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17099,7 +22653,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; GFX1250-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17127,6 +22687,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17155,6 +22721,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -17182,7 +22754,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17198,7 +22776,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17216,6 +22800,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17239,7 +22829,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17255,7 +22851,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17271,7 +22873,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17287,7 +22895,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17303,7 +22917,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17319,7 +22939,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17335,7 +22961,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17351,7 +22983,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17368,7 +23006,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; GFX1250-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17396,6 +23040,12 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17424,6 +23074,12 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -17451,7 +23107,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17467,7 +23129,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17485,6 +23153,12 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17508,7 +23182,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17524,7 +23204,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17540,7 +23226,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17556,7 +23248,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17572,7 +23270,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17588,7 +23292,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17604,7 +23314,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17620,7 +23336,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX12-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17637,7 +23359,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX1250-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17665,6 +23393,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17693,6 +23427,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -17720,7 +23460,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17736,7 +23482,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17754,6 +23506,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17777,7 +23535,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17793,7 +23557,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17809,7 +23579,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17825,7 +23601,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17841,7 +23623,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17857,7 +23645,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17873,7 +23667,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17889,7 +23689,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17906,7 +23712,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX1250-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17934,6 +23746,12 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17962,6 +23780,12 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -17989,7 +23813,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18005,7 +23835,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18023,6 +23859,12 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -18046,7 +23888,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18062,7 +23910,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18078,7 +23932,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18094,7 +23954,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18110,7 +23976,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18126,7 +23998,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18142,7 +24020,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18158,7 +24042,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18175,7 +24065,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18203,6 +24099,12 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -18231,6 +24133,12 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -18258,7 +24166,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18274,7 +24188,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18292,6 +24212,12 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -18315,7 +24241,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18331,7 +24263,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18347,7 +24285,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18363,7 +24307,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18379,7 +24329,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18395,7 +24351,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18411,7 +24373,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18427,7 +24395,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18444,7 +24418,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; GFX1250-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18472,6 +24452,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -18500,6 +24486,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -18527,7 +24519,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18543,7 +24541,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18561,6 +24565,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -18584,7 +24594,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18600,7 +24616,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18616,7 +24638,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18632,7 +24660,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18648,7 +24682,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18664,7 +24704,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18680,7 +24726,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18696,7 +24748,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18713,7 +24771,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18741,6 +24805,12 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -18769,6 +24839,12 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -18796,7 +24872,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18812,7 +24894,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18830,6 +24918,12 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -18853,7 +24947,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18869,7 +24969,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18885,7 +24991,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18901,7 +25013,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18917,7 +25035,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18933,7 +25057,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18949,7 +25079,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18965,7 +25101,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18982,7 +25124,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19010,6 +25158,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -19038,6 +25192,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19065,7 +25225,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19081,7 +25247,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19099,6 +25271,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -19122,7 +25300,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19138,7 +25322,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19154,7 +25344,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19170,7 +25366,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19186,7 +25388,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19202,7 +25410,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19218,7 +25432,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19234,7 +25454,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19251,7 +25477,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19279,6 +25511,12 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -19307,6 +25545,12 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19334,7 +25578,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19350,7 +25600,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19368,6 +25624,12 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -19391,7 +25653,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19407,7 +25675,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19423,7 +25697,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19439,7 +25719,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19455,7 +25741,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19471,7 +25763,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19487,7 +25785,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19503,7 +25807,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19520,7 +25830,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19548,6 +25864,12 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -19576,6 +25898,12 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19603,7 +25931,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19619,7 +25953,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19637,6 +25977,12 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -19660,7 +26006,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19676,7 +26028,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19692,7 +26050,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19708,7 +26072,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19724,7 +26094,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19740,7 +26116,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19756,7 +26138,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19772,7 +26160,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19789,7 +26183,13 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX1250-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19817,6 +26217,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -19845,6 +26251,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19872,7 +26284,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19888,7 +26306,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19906,6 +26330,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -19929,7 +26359,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19945,7 +26381,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19961,7 +26403,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19977,7 +26425,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19993,7 +26447,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20009,7 +26469,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20025,7 +26491,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20041,7 +26513,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20058,7 +26536,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20086,6 +26570,12 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -20114,6 +26604,12 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20141,7 +26637,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20157,7 +26659,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20175,6 +26683,12 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -20198,7 +26712,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20214,7 +26734,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20230,7 +26756,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20246,7 +26778,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20262,7 +26800,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20278,7 +26822,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20294,7 +26844,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20310,7 +26866,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20327,7 +26889,13 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20355,6 +26923,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -20383,6 +26957,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20410,7 +26990,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20426,7 +27012,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20444,6 +27036,12 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -20467,7 +27065,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20483,7 +27087,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20499,7 +27109,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20515,7 +27131,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20531,7 +27153,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20547,7 +27175,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20563,7 +27197,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20579,7 +27219,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20596,7 +27242,13 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20624,6 +27276,12 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -20652,6 +27310,12 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20679,7 +27343,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20695,7 +27365,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20713,6 +27389,12 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -20736,7 +27418,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20752,7 +27440,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20768,7 +27462,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20784,7 +27484,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20800,7 +27506,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20816,7 +27528,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20832,7 +27550,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20848,7 +27572,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20865,7 +27595,13 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index c51c1d52b723..6d1732ad4f68 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -18,6 +18,9 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; GFX6-LABEL: global_workgroup_unordered_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -45,12 +48,16 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -59,29 +66,38 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; ; GFX10-WGP-LABEL: global_workgroup_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -106,101 +122,131 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -214,6 +260,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; GFX6-LABEL: global_workgroup_monotonic_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -241,12 +290,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -255,29 +308,38 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -302,101 +364,131 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -410,6 +502,9 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; GFX6-LABEL: global_workgroup_acquire_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -437,12 +532,16 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -451,30 +550,40 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; ; GFX10-WGP-LABEL: global_workgroup_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -499,105 +608,138 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -611,6 +753,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX6-LABEL: global_workgroup_seq_cst_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -639,9 +784,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -654,8 +802,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -667,8 +819,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -680,6 +836,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -705,8 +864,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] @@ -716,8 +879,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc @@ -728,8 +895,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 @@ -739,8 +910,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 @@ -751,8 +926,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX11-WGP-LABEL: global_workgroup_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -764,8 +943,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -776,46 +959,58 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -829,6 +1024,9 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; GFX6-LABEL: global_workgroup_unordered_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -850,6 +1048,10 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -860,27 +1062,38 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; ; GFX10-WGP-LABEL: global_workgroup_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -898,93 +1111,129 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_unordered_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -997,6 +1246,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; GFX6-LABEL: global_workgroup_monotonic_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1018,6 +1270,10 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1028,27 +1284,38 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1066,93 +1333,129 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_monotonic_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1165,6 +1468,9 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX6-LABEL: global_workgroup_release_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1187,6 +1493,10 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1198,10 +1508,13 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX10-WGP-LABEL: global_workgroup_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1210,10 +1523,13 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX10-CU-LABEL: global_workgroup_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1223,6 +1539,9 @@ define amdgpu_kernel void @global_workgroup_release_store( ; SKIP-CACHE-INV-LABEL: global_workgroup_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1241,10 +1560,13 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -1252,10 +1574,13 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -1263,10 +1588,13 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 @@ -1274,10 +1602,13 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 @@ -1285,10 +1616,13 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX11-WGP-LABEL: global_workgroup_release_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1297,10 +1631,13 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX11-CU-LABEL: global_workgroup_release_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1309,43 +1646,55 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX12-WGP-LABEL: global_workgroup_release_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_release_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1358,6 +1707,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX6-LABEL: global_workgroup_seq_cst_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1380,6 +1732,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -1391,10 +1747,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1403,10 +1762,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1416,6 +1778,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1434,10 +1799,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -1445,10 +1813,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -1456,10 +1827,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 @@ -1467,10 +1841,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 @@ -1478,10 +1855,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX11-WGP-LABEL: global_workgroup_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1490,10 +1870,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1502,43 +1885,55 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1551,6 +1946,10 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; GFX6-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -1570,18 +1969,26 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1591,7 +1998,11 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1602,6 +2013,10 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -1618,7 +2033,11 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1628,7 +2047,11 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1638,7 +2061,11 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1648,7 +2075,11 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1658,7 +2089,11 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1668,7 +2103,11 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1678,7 +2117,11 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1688,7 +2131,11 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1699,7 +2146,11 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; GFX1250-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1717,6 +2168,10 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX6-LABEL: global_workgroup_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -1736,18 +2191,26 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1759,7 +2222,11 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; ; GFX10-CU-LABEL: global_workgroup_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1770,6 +2237,10 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -1786,7 +2257,11 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1796,7 +2271,11 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1808,7 +2287,11 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1818,7 +2301,11 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1830,7 +2317,11 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: global_workgroup_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1842,7 +2333,11 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; ; GFX11-CU-LABEL: global_workgroup_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1852,7 +2347,11 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: global_workgroup_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1864,7 +2363,11 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; ; GFX12-CU-LABEL: global_workgroup_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1875,7 +2378,11 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX1250-LABEL: global_workgroup_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1894,6 +2401,10 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX6-LABEL: global_workgroup_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -1914,11 +2425,15 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -1926,7 +2441,11 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; ; GFX10-WGP-LABEL: global_workgroup_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1938,7 +2457,11 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; ; GFX10-CU-LABEL: global_workgroup_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1951,6 +2474,10 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_workgroup_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -1968,7 +2495,11 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1979,7 +2510,11 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1990,7 +2525,11 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2001,7 +2540,11 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2012,7 +2555,11 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; ; GFX11-WGP-LABEL: global_workgroup_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2024,7 +2571,11 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; ; GFX11-CU-LABEL: global_workgroup_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2036,7 +2587,11 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; ; GFX12-WGP-LABEL: global_workgroup_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2050,7 +2605,11 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; ; GFX12-CU-LABEL: global_workgroup_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2065,7 +2624,11 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX1250-LABEL: global_workgroup_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2085,6 +2648,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX6-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2105,11 +2672,15 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -2117,7 +2688,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2131,7 +2706,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2145,6 +2724,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2162,7 +2745,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2173,7 +2760,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2186,7 +2777,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2197,7 +2792,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2210,7 +2809,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2224,7 +2827,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2237,7 +2844,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2253,7 +2864,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2269,7 +2884,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX1250-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2290,6 +2909,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX6-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2310,11 +2933,15 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 @@ -2322,7 +2949,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2336,7 +2967,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2350,6 +2985,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2367,7 +3006,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2378,7 +3021,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2391,7 +3038,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2402,7 +3053,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2415,7 +3070,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2429,7 +3088,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2442,7 +3105,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2458,7 +3125,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2474,7 +3145,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX1250-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2495,6 +3170,10 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX6-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2517,6 +3196,10 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2531,7 +3214,11 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2544,7 +3231,11 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2557,6 +3248,10 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2575,7 +3270,11 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2587,7 +3286,11 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2600,7 +3303,11 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2612,7 +3319,11 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2625,7 +3336,11 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2638,7 +3353,11 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2650,7 +3369,11 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2663,7 +3386,11 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2676,7 +3403,11 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX1250-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2697,6 +3428,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2720,6 +3455,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2735,7 +3474,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2750,7 +3493,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2765,6 +3512,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -2784,7 +3535,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2797,7 +3552,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2811,7 +3570,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2824,7 +3587,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2838,7 +3605,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2853,7 +3624,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2867,7 +3642,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2884,7 +3663,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2901,7 +3684,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX1250-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2924,6 +3711,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -2947,6 +3738,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2962,7 +3757,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2977,7 +3776,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2992,6 +3795,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -3011,7 +3818,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3024,7 +3835,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3038,7 +3853,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3051,7 +3870,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3065,7 +3888,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3080,7 +3907,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3094,7 +3925,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3111,7 +3946,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3128,7 +3967,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX1250-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -3152,6 +3995,12 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3176,11 +4025,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3190,6 +4044,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3200,7 +4055,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3214,7 +4075,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3230,6 +4097,12 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3250,7 +4123,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3264,7 +4143,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3278,7 +4163,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3292,7 +4183,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3306,7 +4203,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3320,7 +4223,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3334,7 +4243,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3348,7 +4263,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3363,7 +4284,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3387,6 +4314,12 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3411,11 +4344,16 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3425,6 +4363,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3435,7 +4374,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3451,7 +4396,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3467,6 +4418,12 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3487,7 +4444,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3501,7 +4464,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3517,7 +4486,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3531,7 +4506,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3547,7 +4528,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3563,7 +4550,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3577,7 +4570,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3593,7 +4592,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3608,7 +4613,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3633,6 +4644,12 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3658,11 +4675,16 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3672,6 +4694,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3683,7 +4706,13 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3699,7 +4728,13 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3717,6 +4752,12 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3738,7 +4779,13 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3753,7 +4800,13 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3768,7 +4821,13 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3783,7 +4842,13 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3798,7 +4863,13 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3814,7 +4885,13 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3830,7 +4907,13 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3848,7 +4931,13 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3867,7 +4956,13 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX1250-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -3893,6 +4988,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3918,11 +5019,16 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -3932,6 +5038,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -3943,7 +5050,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3961,7 +5074,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3980,6 +5099,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4001,7 +5126,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4016,7 +5147,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4033,7 +5170,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4048,7 +5191,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4065,7 +5214,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4083,7 +5238,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4100,7 +5261,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4120,7 +5287,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4140,7 +5313,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4167,6 +5346,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4192,11 +5377,16 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4206,6 +5396,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4217,7 +5408,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4235,7 +5432,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4254,6 +5457,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4275,7 +5484,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4290,7 +5505,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4307,7 +5528,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4322,7 +5549,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4339,7 +5572,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4357,7 +5596,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4374,7 +5619,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4394,7 +5645,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4414,7 +5671,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4441,6 +5704,12 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4465,11 +5734,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4479,6 +5753,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4489,7 +5764,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4505,7 +5786,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4521,6 +5808,12 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4541,7 +5834,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4555,7 +5854,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4571,7 +5876,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4585,7 +5896,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4601,7 +5918,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4617,7 +5940,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4631,7 +5960,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4647,7 +5982,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4662,7 +6003,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4687,6 +6034,12 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4711,11 +6064,16 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4725,6 +6083,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4735,7 +6094,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4751,7 +6116,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4767,6 +6138,12 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4787,7 +6164,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4801,7 +6184,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4817,7 +6206,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4831,7 +6226,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4847,7 +6248,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4863,7 +6270,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4877,7 +6290,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4893,7 +6312,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4908,7 +6333,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX1250-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -4933,6 +6364,12 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4958,11 +6395,16 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -4972,6 +6414,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4983,7 +6426,13 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5001,7 +6450,13 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5020,6 +6475,12 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5041,7 +6502,13 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5056,7 +6523,13 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5073,7 +6546,13 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5088,7 +6567,13 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5105,7 +6590,13 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5123,7 +6614,13 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5140,7 +6637,13 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5160,7 +6663,13 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5180,7 +6689,13 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX1250-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5207,6 +6722,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5232,11 +6753,16 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5246,6 +6772,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5257,7 +6784,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5275,7 +6808,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5294,6 +6833,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5315,7 +6860,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5330,7 +6881,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5347,7 +6904,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5362,7 +6925,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5379,7 +6948,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5397,7 +6972,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5414,7 +6995,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5434,7 +7021,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5454,7 +7047,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5481,6 +7080,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5506,11 +7111,16 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5520,6 +7130,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5531,7 +7142,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5549,7 +7166,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5568,6 +7191,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5589,7 +7218,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5604,7 +7239,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5621,7 +7262,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5636,7 +7283,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5653,7 +7306,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5671,7 +7330,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5688,7 +7353,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5708,7 +7379,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5728,7 +7405,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -5755,6 +7438,12 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5780,11 +7469,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -5794,6 +7488,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5805,7 +7500,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5823,7 +7524,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5841,6 +7548,12 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5862,7 +7575,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5877,7 +7596,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5894,7 +7619,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5909,7 +7640,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5926,7 +7663,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5944,7 +7687,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5960,7 +7709,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5980,7 +7735,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5999,7 +7760,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6026,6 +7793,12 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6051,11 +7824,16 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6065,6 +7843,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6076,7 +7855,13 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6094,7 +7879,13 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6112,6 +7903,12 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6133,7 +7930,13 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6148,7 +7951,13 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6165,7 +7974,13 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6180,7 +7995,13 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6197,7 +8018,13 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6215,7 +8042,13 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6231,7 +8064,13 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6251,7 +8090,13 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6270,7 +8115,13 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6297,6 +8148,12 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6322,11 +8179,16 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6336,6 +8198,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6347,7 +8210,13 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6365,7 +8234,13 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6384,6 +8259,12 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6405,7 +8286,13 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6420,7 +8307,13 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6437,7 +8330,13 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6452,7 +8351,13 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6469,7 +8374,13 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6487,7 +8398,13 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6504,7 +8421,13 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6524,7 +8447,13 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6544,7 +8473,13 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX1250-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6571,6 +8506,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6596,11 +8537,16 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6610,6 +8556,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6621,7 +8568,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6639,7 +8592,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6658,6 +8617,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6679,7 +8644,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6694,7 +8665,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6711,7 +8688,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6726,7 +8709,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6743,7 +8732,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6761,7 +8756,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6778,7 +8779,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6798,7 +8805,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6818,7 +8831,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -6845,6 +8864,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6870,11 +8895,16 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -6884,6 +8914,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -6895,7 +8926,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6913,7 +8950,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6932,6 +8975,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6953,7 +9002,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6968,7 +9023,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6985,7 +9046,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7000,7 +9067,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7017,7 +9090,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7035,7 +9114,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7052,7 +9137,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7072,7 +9163,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7092,7 +9189,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7119,6 +9222,12 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7147,6 +9256,12 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7174,7 +9289,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7190,7 +9311,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7208,6 +9335,12 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7231,7 +9364,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7247,7 +9386,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7263,7 +9408,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7279,7 +9430,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7295,7 +9452,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7311,7 +9474,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7327,7 +9496,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7343,7 +9518,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7360,7 +9541,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7388,6 +9575,12 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7416,6 +9609,12 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7443,7 +9642,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7460,7 +9665,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7478,6 +9689,12 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7501,7 +9718,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7517,7 +9740,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7534,7 +9763,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7550,7 +9785,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7567,7 +9808,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7584,7 +9831,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7600,7 +9853,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7617,7 +9876,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7634,7 +9899,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7662,6 +9933,12 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7691,6 +9968,12 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -7719,7 +10002,13 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7737,7 +10026,13 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7757,6 +10052,12 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7781,7 +10082,13 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7798,7 +10105,13 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7815,7 +10128,13 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7832,7 +10151,13 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7849,7 +10174,13 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7867,7 +10198,13 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7885,7 +10222,13 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7905,7 +10248,13 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7926,7 +10275,13 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -7956,6 +10311,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7985,6 +10346,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8013,7 +10380,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8032,7 +10405,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8052,6 +10431,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8076,7 +10461,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8093,7 +10484,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8111,7 +10508,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8128,7 +10531,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8146,7 +10555,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8165,7 +10580,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8183,7 +10604,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8204,7 +10631,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8225,7 +10658,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8255,6 +10694,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8284,6 +10729,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8312,7 +10763,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8331,7 +10788,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8351,6 +10814,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8375,7 +10844,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8392,7 +10867,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8410,7 +10891,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8427,7 +10914,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8445,7 +10938,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8464,7 +10963,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8482,7 +10987,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8503,7 +11014,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8524,7 +11041,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8554,6 +11077,12 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8582,6 +11111,12 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8609,7 +11144,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8626,7 +11167,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8644,6 +11191,12 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8667,7 +11220,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8683,7 +11242,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8700,7 +11265,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8716,7 +11287,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8733,7 +11310,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8750,7 +11333,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8766,7 +11355,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8783,7 +11378,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8800,7 +11401,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -8828,6 +11435,12 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8856,6 +11469,12 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -8883,7 +11502,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8900,7 +11525,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8918,6 +11549,12 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8941,7 +11578,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8957,7 +11600,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8974,7 +11623,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8990,7 +11645,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9007,7 +11668,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9024,7 +11691,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9040,7 +11713,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9057,7 +11736,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9074,7 +11759,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9102,6 +11793,12 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9131,6 +11828,12 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9159,7 +11862,13 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9178,7 +11887,13 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9198,6 +11913,12 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9222,7 +11943,13 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9239,7 +11966,13 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9257,7 +11990,13 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9274,7 +12013,13 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9292,7 +12037,13 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9311,7 +12062,13 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9329,7 +12086,13 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9350,7 +12113,13 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9371,7 +12140,13 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9401,6 +12176,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9430,6 +12211,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9458,7 +12245,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9477,7 +12270,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9497,6 +12296,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9521,7 +12326,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9538,7 +12349,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9556,7 +12373,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9573,7 +12396,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9591,7 +12420,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9610,7 +12445,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9628,7 +12469,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9649,7 +12496,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9670,7 +12523,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9700,6 +12559,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9729,6 +12594,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -9757,7 +12628,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9776,7 +12653,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9796,6 +12679,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9820,7 +12709,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9837,7 +12732,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9855,7 +12756,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9872,7 +12779,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9890,7 +12803,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9909,7 +12828,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9927,7 +12852,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9948,7 +12879,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9969,7 +12906,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -9999,6 +12942,12 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10028,6 +12977,12 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10056,7 +13011,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10075,7 +13036,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10095,6 +13062,12 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10119,7 +13092,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10136,7 +13115,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10154,7 +13139,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10171,7 +13162,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10189,7 +13186,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10208,7 +13211,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10226,7 +13235,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10247,7 +13262,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10268,7 +13289,13 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10298,6 +13325,12 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10327,6 +13360,12 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10355,7 +13394,13 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10374,7 +13419,13 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10394,6 +13445,12 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10418,7 +13475,13 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10435,7 +13498,13 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10453,7 +13522,13 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10470,7 +13545,13 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10488,7 +13569,13 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10507,7 +13594,13 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10525,7 +13618,13 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10546,7 +13645,13 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10567,7 +13672,13 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10597,6 +13708,12 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10626,6 +13743,12 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10654,7 +13777,13 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10673,7 +13802,13 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10693,6 +13828,12 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10717,7 +13858,13 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10734,7 +13881,13 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10752,7 +13905,13 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10769,7 +13928,13 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10787,7 +13952,13 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10806,7 +13977,13 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10824,7 +14001,13 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10845,7 +14028,13 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10866,7 +14055,13 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -10896,6 +14091,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10925,6 +14126,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -10953,7 +14160,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10972,7 +14185,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10992,6 +14211,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11016,7 +14241,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11033,7 +14264,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11051,7 +14288,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11068,7 +14311,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11086,7 +14335,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11105,7 +14360,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11123,7 +14384,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11144,7 +14411,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11165,7 +14438,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -11195,6 +14474,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11224,6 +14509,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -11252,7 +14543,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11271,7 +14568,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11291,6 +14594,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11315,7 +14624,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11332,7 +14647,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11350,7 +14671,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11367,7 +14694,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11385,7 +14718,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11404,7 +14743,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11422,7 +14767,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11443,7 +14794,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11464,7 +14821,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -11493,6 +14856,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; GFX6-LABEL: global_workgroup_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11520,12 +14886,16 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -11534,29 +14904,38 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11581,101 +14960,131 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -11689,6 +15098,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; GFX6-LABEL: global_workgroup_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11716,12 +15128,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -11730,29 +15146,38 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11777,101 +15202,131 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -11885,6 +15340,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; GFX6-LABEL: global_workgroup_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11912,12 +15370,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -11926,30 +15388,40 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11974,105 +15446,138 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -12086,6 +15591,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -12113,12 +15621,16 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -12127,32 +15639,44 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12177,117 +15701,154 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -12301,6 +15862,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; GFX6-LABEL: global_workgroup_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -12322,6 +15886,10 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -12332,27 +15900,38 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12370,93 +15949,129 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -12469,6 +16084,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; GFX6-LABEL: global_workgroup_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -12490,6 +16108,10 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -12500,27 +16122,38 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12538,93 +16171,129 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -12637,6 +16306,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX6-LABEL: global_workgroup_one_as_release_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -12658,6 +16330,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -12668,24 +16344,30 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm @@ -12693,6 +16375,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12710,109 +16395,141 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -12825,6 +16542,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -12846,6 +16566,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -12856,24 +16580,30 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm @@ -12881,6 +16611,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12898,109 +16631,141 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -13013,6 +16778,10 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX6-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -13032,18 +16801,26 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13053,7 +16830,11 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13064,6 +16845,10 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -13080,7 +16865,11 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13090,7 +16879,11 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13100,7 +16893,11 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13110,7 +16907,11 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13120,7 +16921,11 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13130,7 +16935,11 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13140,7 +16949,11 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13150,7 +16963,11 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13161,7 +16978,11 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX1250-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13179,6 +17000,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX6-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -13198,18 +17023,26 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13221,7 +17054,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13232,6 +17069,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -13248,7 +17089,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13258,7 +17103,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13270,7 +17119,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13280,7 +17133,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13292,7 +17149,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13304,7 +17165,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13314,7 +17179,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13326,7 +17195,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13337,7 +17210,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX1250-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13356,6 +17233,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX6-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -13375,18 +17256,26 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13398,7 +17287,11 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13411,6 +17304,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -13427,7 +17324,11 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13437,7 +17338,11 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13448,7 +17353,11 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13458,7 +17367,11 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13469,7 +17382,11 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13481,7 +17398,11 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13493,7 +17414,11 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13507,7 +17432,11 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; ; GFX12-CU-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13522,7 +17451,11 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX1250-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13542,6 +17475,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -13561,18 +17498,26 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13586,7 +17531,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13600,6 +17549,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -13616,7 +17569,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13626,7 +17583,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13639,7 +17600,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13649,7 +17614,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13662,7 +17631,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13676,7 +17649,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13689,7 +17666,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13705,7 +17686,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13721,7 +17706,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13742,6 +17731,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -13761,18 +17754,26 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13786,7 +17787,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13800,6 +17805,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -13816,7 +17825,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13826,7 +17839,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13839,7 +17856,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13849,7 +17870,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13862,7 +17887,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13876,7 +17905,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13889,7 +17922,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13905,7 +17942,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13921,7 +17962,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -13942,6 +17987,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -13964,6 +18013,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13978,7 +18031,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13991,7 +18048,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14004,6 +18065,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -14022,7 +18087,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14034,7 +18103,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14047,7 +18120,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14059,7 +18136,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14072,7 +18153,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14085,7 +18170,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14097,7 +18186,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14110,7 +18203,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14123,7 +18220,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX1250-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14144,6 +18245,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -14166,6 +18271,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14180,7 +18289,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14195,7 +18308,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14210,6 +18327,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -14228,7 +18349,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14240,7 +18365,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14254,7 +18383,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14266,7 +18399,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14280,7 +18417,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14295,7 +18436,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14309,7 +18454,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14326,7 +18475,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14343,7 +18496,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14366,6 +18523,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 @@ -14388,6 +18549,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14402,7 +18567,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14417,7 +18586,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14432,6 +18605,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 @@ -14450,7 +18627,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14462,7 +18643,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14476,7 +18661,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14488,7 +18677,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14502,7 +18695,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14517,7 +18714,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14531,7 +18732,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14548,7 +18753,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14565,7 +18774,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14589,6 +18802,12 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14613,11 +18832,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14627,6 +18851,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -14637,7 +18862,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14651,7 +18882,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14667,6 +18904,12 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14687,7 +18930,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14701,7 +18950,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14715,7 +18970,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14729,7 +18990,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14743,7 +19010,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14757,7 +19030,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14771,7 +19050,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14785,7 +19070,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14800,7 +19091,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -14824,6 +19121,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14848,11 +19151,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -14862,6 +19170,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -14872,7 +19181,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14888,7 +19203,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14904,6 +19225,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14924,7 +19251,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14938,7 +19271,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14954,7 +19293,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14968,7 +19313,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14984,7 +19335,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15000,7 +19357,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15014,7 +19377,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15030,7 +19399,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15045,7 +19420,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15070,6 +19451,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15094,11 +19481,16 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15108,6 +19500,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15118,7 +19511,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15134,7 +19533,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15152,6 +19557,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15172,7 +19583,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15186,7 +19603,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15201,7 +19624,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15215,7 +19644,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15230,7 +19665,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15246,7 +19687,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15262,7 +19709,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15280,7 +19733,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15299,7 +19758,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX1250-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15325,6 +19790,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15349,11 +19820,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15363,6 +19839,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15373,7 +19850,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15391,7 +19874,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15410,6 +19899,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15430,7 +19925,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15444,7 +19945,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15461,7 +19968,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15475,7 +19988,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15492,7 +20011,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15510,7 +20035,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15527,7 +20058,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15547,7 +20084,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15567,7 +20110,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15594,6 +20143,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15618,11 +20173,16 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15632,6 +20192,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15642,7 +20203,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15660,7 +20227,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15679,6 +20252,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15699,7 +20278,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15713,7 +20298,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15730,7 +20321,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15744,7 +20341,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15761,7 +20364,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15779,7 +20388,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15796,7 +20411,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15816,7 +20437,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15836,7 +20463,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -15863,6 +20496,12 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15887,11 +20526,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -15901,6 +20545,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -15911,7 +20556,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15927,7 +20578,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15943,6 +20600,12 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15963,7 +20626,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15977,7 +20646,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15993,7 +20668,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16007,7 +20688,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16023,7 +20710,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16039,7 +20732,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16053,7 +20752,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16069,7 +20774,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16084,7 +20795,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16109,6 +20826,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16133,11 +20856,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16147,6 +20875,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16157,7 +20886,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16173,7 +20908,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16189,6 +20930,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16209,7 +20956,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16223,7 +20976,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16239,7 +20998,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16253,7 +21018,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16269,7 +21040,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16285,7 +21062,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16299,7 +21082,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16315,7 +21104,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16330,7 +21125,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX1250-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16355,6 +21156,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16379,11 +21186,16 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16393,6 +21205,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16403,7 +21216,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16421,7 +21240,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16440,6 +21265,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16460,7 +21291,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16474,7 +21311,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16491,7 +21334,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16505,7 +21354,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16522,7 +21377,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16540,7 +21401,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16557,7 +21424,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16577,7 +21450,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16597,7 +21476,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX1250-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16624,6 +21509,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16648,11 +21539,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16662,6 +21558,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16672,7 +21569,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16690,7 +21593,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16709,6 +21618,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16729,7 +21644,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16743,7 +21664,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16760,7 +21687,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16774,7 +21707,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16791,7 +21730,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16809,7 +21754,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16826,7 +21777,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16846,7 +21803,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16866,7 +21829,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -16893,6 +21862,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16917,11 +21892,16 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -16931,6 +21911,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -16941,7 +21922,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16959,7 +21946,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16978,6 +21971,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16998,7 +21997,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17012,7 +22017,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17029,7 +22040,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17043,7 +22060,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17060,7 +22083,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17078,7 +22107,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17095,7 +22130,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17115,7 +22156,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17135,7 +22182,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17162,6 +22215,12 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17186,11 +22245,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17200,6 +22264,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17210,7 +22275,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17228,7 +22299,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17246,6 +22323,12 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17266,7 +22349,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17280,7 +22369,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17297,7 +22392,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17311,7 +22412,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17328,7 +22435,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17346,7 +22459,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17362,7 +22481,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17382,7 +22507,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17401,7 +22532,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17428,6 +22565,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17452,11 +22595,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17466,6 +22614,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17476,7 +22625,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17494,7 +22649,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17512,6 +22673,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17532,7 +22699,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17546,7 +22719,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17563,7 +22742,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17577,7 +22762,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17594,7 +22785,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17612,7 +22809,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17628,7 +22831,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17648,7 +22857,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17667,7 +22882,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17694,6 +22915,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17718,11 +22945,16 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -17732,6 +22964,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -17742,7 +22975,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17760,7 +22999,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17779,6 +23024,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17799,7 +23050,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17813,7 +23070,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17830,7 +23093,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17844,7 +23113,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17861,7 +23136,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17879,7 +23160,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17896,7 +23183,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17916,7 +23209,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17936,7 +23235,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -17963,6 +23268,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17987,11 +23298,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -18001,6 +23317,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -18011,7 +23328,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18029,7 +23352,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18048,6 +23377,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -18068,7 +23403,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18082,7 +23423,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18099,7 +23446,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18113,7 +23466,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18130,7 +23489,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18148,7 +23513,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18165,7 +23536,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18185,7 +23562,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18205,7 +23588,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18232,6 +23621,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -18256,11 +23651,16 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 ; GFX7-NEXT: s_mov_b64 s[10:11], 16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, s8 ; GFX7-NEXT: s_mov_b32 s5, s9 ; GFX7-NEXT: s_mov_b32 s9, s10 @@ -18270,6 +23670,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX7-NEXT: s_mov_b32 s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -18280,7 +23681,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18298,7 +23705,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18317,6 +23730,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -18337,7 +23756,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18351,7 +23776,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18368,7 +23799,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18382,7 +23819,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18399,7 +23842,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18417,7 +23866,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18434,7 +23889,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18454,7 +23915,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18474,7 +23941,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18501,6 +23974,12 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -18529,6 +24008,12 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -18556,7 +24041,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18572,7 +24063,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18590,6 +24087,12 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -18613,7 +24116,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18629,7 +24138,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18645,7 +24160,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18661,7 +24182,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18677,7 +24204,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18693,7 +24226,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18709,7 +24248,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18725,7 +24270,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18742,7 +24293,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; GFX1250-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -18770,6 +24327,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -18798,6 +24361,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -18825,7 +24394,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18842,7 +24417,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18860,6 +24441,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -18883,7 +24470,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18899,7 +24492,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18916,7 +24515,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18932,7 +24537,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18949,7 +24560,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18966,7 +24583,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18982,7 +24605,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18999,7 +24628,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19016,7 +24651,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; GFX1250-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19044,6 +24685,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -19072,6 +24719,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19099,7 +24752,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19117,7 +24776,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19137,6 +24802,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -19160,7 +24831,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19176,7 +24853,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19193,7 +24876,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19209,7 +24898,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19226,7 +24921,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19244,7 +24945,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19262,7 +24969,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19282,7 +24995,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX12-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19303,7 +25022,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX1250-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19333,6 +25058,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -19361,6 +25092,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19388,7 +25125,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19407,7 +25150,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19427,6 +25176,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -19450,7 +25205,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19466,7 +25227,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19484,7 +25251,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19500,7 +25273,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19518,7 +25297,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19537,7 +25322,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19555,7 +25346,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19576,7 +25373,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19597,7 +25400,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19627,6 +25436,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -19655,6 +25470,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19682,7 +25503,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19701,7 +25528,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19721,6 +25554,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -19744,7 +25583,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19760,7 +25605,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19778,7 +25629,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19794,7 +25651,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19812,7 +25675,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19831,7 +25700,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19849,7 +25724,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19870,7 +25751,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19891,7 +25778,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -19921,6 +25814,12 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -19949,6 +25848,12 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -19976,7 +25881,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19993,7 +25904,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20011,6 +25928,12 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -20034,7 +25957,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20050,7 +25979,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20067,7 +26002,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20083,7 +26024,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20100,7 +26047,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20117,7 +26070,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20133,7 +26092,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20150,7 +26115,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20167,7 +26138,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; GFX1250-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20195,6 +26172,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -20223,6 +26206,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20250,7 +26239,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20267,7 +26262,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20285,6 +26286,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -20308,7 +26315,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20324,7 +26337,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20341,7 +26360,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20357,7 +26382,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20374,7 +26405,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20391,7 +26428,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20407,7 +26450,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20424,7 +26473,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20441,7 +26496,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20469,6 +26530,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -20497,6 +26564,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20524,7 +26597,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20543,7 +26622,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20563,6 +26648,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -20586,7 +26677,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20602,7 +26699,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20620,7 +26723,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20636,7 +26745,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20654,7 +26769,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20673,7 +26794,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20691,7 +26818,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20712,7 +26845,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20733,7 +26872,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -20763,6 +26908,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -20791,6 +26942,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -20818,7 +26975,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20837,7 +27000,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20857,6 +27026,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -20880,7 +27055,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20896,7 +27077,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20914,7 +27101,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20930,7 +27123,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20948,7 +27147,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20967,7 +27172,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20985,7 +27196,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21006,7 +27223,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21027,7 +27250,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21057,6 +27286,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -21085,6 +27320,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21112,7 +27353,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21131,7 +27378,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21151,6 +27404,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -21174,7 +27433,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21190,7 +27455,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21208,7 +27479,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21224,7 +27501,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21242,7 +27525,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21261,7 +27550,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21279,7 +27574,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21300,7 +27601,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21321,7 +27628,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21351,6 +27664,12 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -21379,6 +27698,12 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21406,7 +27731,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21425,7 +27756,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21445,6 +27782,12 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -21468,7 +27811,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21484,7 +27833,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21502,7 +27857,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21518,7 +27879,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21536,7 +27903,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21555,7 +27928,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21573,7 +27952,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21594,7 +27979,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21615,7 +28006,13 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX1250-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21645,6 +28042,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -21673,6 +28076,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21700,7 +28109,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21719,7 +28134,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21739,6 +28160,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -21762,7 +28189,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21778,7 +28211,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21796,7 +28235,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21812,7 +28257,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21830,7 +28281,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21849,7 +28306,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21867,7 +28330,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21888,7 +28357,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21909,7 +28384,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -21939,6 +28420,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -21967,6 +28454,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -21994,7 +28487,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22013,7 +28512,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22033,6 +28538,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -22056,7 +28567,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22072,7 +28589,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22090,7 +28613,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22106,7 +28635,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22124,7 +28659,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22143,7 +28684,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22161,7 +28708,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22182,7 +28735,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22203,7 +28762,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -22233,6 +28798,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -22261,6 +28832,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -22288,7 +28865,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22307,7 +28890,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22327,6 +28916,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -22350,7 +28945,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22366,7 +28967,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22384,7 +28991,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22400,7 +29013,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22418,7 +29037,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22437,7 +29062,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22455,7 +29086,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22476,7 +29113,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22497,7 +29140,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv @@ -22527,6 +29176,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -22555,6 +29210,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 ; GFX7-NEXT: s_mov_b64 s[12:13], 16 @@ -22582,7 +29243,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22601,7 +29268,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22621,6 +29294,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -22644,7 +29323,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22660,7 +29345,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0xc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22678,7 +29369,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22694,7 +29391,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0xc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22712,7 +29415,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22731,7 +29440,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22749,7 +29464,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22770,7 +29491,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0xc ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22791,7 +29518,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc nv diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll index 7fe33ac93ee0..a4fcdf3f31d8 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -33,135 +33,169 @@ define amdgpu_kernel void @local_agent_unordered_load( ; ; GFX7-LABEL: local_agent_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -169,11 +203,15 @@ define amdgpu_kernel void @local_agent_unordered_load( ; ; GFX12-CU-LABEL: local_agent_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -182,11 +220,15 @@ define amdgpu_kernel void @local_agent_unordered_load( ; GFX1250-LABEL: local_agent_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -217,135 +259,169 @@ define amdgpu_kernel void @local_agent_monotonic_load( ; ; GFX7-LABEL: local_agent_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -353,11 +429,15 @@ define amdgpu_kernel void @local_agent_monotonic_load( ; ; GFX12-CU-LABEL: local_agent_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -366,11 +446,15 @@ define amdgpu_kernel void @local_agent_monotonic_load( ; GFX1250-LABEL: local_agent_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -401,10 +485,13 @@ define amdgpu_kernel void @local_agent_acquire_load( ; ; GFX7-LABEL: local_agent_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -415,9 +502,12 @@ define amdgpu_kernel void @local_agent_acquire_load( ; ; GFX10-WGP-LABEL: local_agent_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -428,9 +518,12 @@ define amdgpu_kernel void @local_agent_acquire_load( ; ; GFX10-CU-LABEL: local_agent_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -440,10 +533,13 @@ define amdgpu_kernel void @local_agent_acquire_load( ; ; SKIP-CACHE-INV-LABEL: local_agent_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -454,9 +550,12 @@ define amdgpu_kernel void @local_agent_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -466,22 +565,28 @@ define amdgpu_kernel void @local_agent_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -491,22 +596,28 @@ define amdgpu_kernel void @local_agent_acquire_load( ; ; GFX942-TGSPLIT-LABEL: local_agent_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -517,9 +628,12 @@ define amdgpu_kernel void @local_agent_acquire_load( ; ; GFX11-CU-LABEL: local_agent_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -529,25 +643,33 @@ define amdgpu_kernel void @local_agent_acquire_load( ; ; GFX12-WGP-LABEL: local_agent_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_agent_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -555,12 +677,16 @@ define amdgpu_kernel void @local_agent_acquire_load( ; GFX1250-LABEL: local_agent_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 ; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: ds_store_b32 v0, v1 ; GFX1250-NEXT: s_endpgm @@ -591,10 +717,13 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; ; GFX7-LABEL: local_agent_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_read_b32 v1, v0 @@ -606,9 +735,12 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; ; GFX10-WGP-LABEL: local_agent_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -621,9 +753,12 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; ; GFX10-CU-LABEL: local_agent_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -635,10 +770,13 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 @@ -650,9 +788,12 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -663,9 +804,12 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -677,9 +821,12 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -690,9 +837,12 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; ; GFX942-TGSPLIT-LABEL: local_agent_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -704,9 +854,12 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; ; GFX11-WGP-LABEL: local_agent_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -719,9 +872,12 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; ; GFX11-CU-LABEL: local_agent_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -733,9 +889,12 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; ; GFX12-WGP-LABEL: local_agent_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -744,15 +903,19 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_agent_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -760,6 +923,7 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -767,14 +931,18 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX1250-LABEL: local_agent_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_load_b32 v1, v0 ; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: ds_store_b32 v0, v1 ; GFX1250-NEXT: s_endpgm @@ -788,12 +956,14 @@ entry: define amdgpu_kernel void @local_agent_unordered_store( ; GFX6-LABEL: local_agent_unordered_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -801,6 +971,10 @@ define amdgpu_kernel void @local_agent_unordered_store( ; GFX7-LABEL: local_agent_unordered_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -812,6 +986,10 @@ define amdgpu_kernel void @local_agent_unordered_store( ; GFX10-WGP-LABEL: local_agent_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -822,6 +1000,10 @@ define amdgpu_kernel void @local_agent_unordered_store( ; GFX10-CU-LABEL: local_agent_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -832,6 +1014,10 @@ define amdgpu_kernel void @local_agent_unordered_store( ; SKIP-CACHE-INV-LABEL: local_agent_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -843,6 +1029,10 @@ define amdgpu_kernel void @local_agent_unordered_store( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -853,6 +1043,10 @@ define amdgpu_kernel void @local_agent_unordered_store( ; GFX90A-TGSPLIT-LABEL: local_agent_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -863,6 +1057,10 @@ define amdgpu_kernel void @local_agent_unordered_store( ; GFX942-NOTTGSPLIT-LABEL: local_agent_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -873,6 +1071,10 @@ define amdgpu_kernel void @local_agent_unordered_store( ; GFX942-TGSPLIT-LABEL: local_agent_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -883,6 +1085,10 @@ define amdgpu_kernel void @local_agent_unordered_store( ; GFX11-WGP-LABEL: local_agent_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -893,6 +1099,10 @@ define amdgpu_kernel void @local_agent_unordered_store( ; GFX11-CU-LABEL: local_agent_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -903,6 +1113,10 @@ define amdgpu_kernel void @local_agent_unordered_store( ; GFX12-WGP-LABEL: local_agent_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -913,6 +1127,10 @@ define amdgpu_kernel void @local_agent_unordered_store( ; GFX12-CU-LABEL: local_agent_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -924,6 +1142,10 @@ define amdgpu_kernel void @local_agent_unordered_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -939,12 +1161,14 @@ entry: define amdgpu_kernel void @local_agent_monotonic_store( ; GFX6-LABEL: local_agent_monotonic_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -952,6 +1176,10 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; GFX7-LABEL: local_agent_monotonic_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -963,6 +1191,10 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; GFX10-WGP-LABEL: local_agent_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -973,6 +1205,10 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; GFX10-CU-LABEL: local_agent_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -983,6 +1219,10 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -994,6 +1234,10 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1004,6 +1248,10 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1014,6 +1262,10 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; GFX942-NOTTGSPLIT-LABEL: local_agent_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1024,6 +1276,10 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; GFX942-TGSPLIT-LABEL: local_agent_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1034,6 +1290,10 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; GFX11-WGP-LABEL: local_agent_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1044,6 +1304,10 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; GFX11-CU-LABEL: local_agent_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1054,6 +1318,10 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; GFX12-WGP-LABEL: local_agent_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1064,6 +1332,10 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; GFX12-CU-LABEL: local_agent_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1075,6 +1347,10 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -1090,12 +1366,14 @@ entry: define amdgpu_kernel void @local_agent_release_store( ; GFX6-LABEL: local_agent_release_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -1104,6 +1382,10 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX7-LABEL: local_agent_release_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1116,6 +1398,10 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX10-WGP-LABEL: local_agent_release_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -1128,6 +1414,10 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX10-CU-LABEL: local_agent_release_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -1140,6 +1430,10 @@ define amdgpu_kernel void @local_agent_release_store( ; SKIP-CACHE-INV-LABEL: local_agent_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1152,6 +1446,10 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1163,6 +1461,10 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX90A-TGSPLIT-LABEL: local_agent_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1174,6 +1476,10 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX942-NOTTGSPLIT-LABEL: local_agent_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1185,6 +1491,10 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX942-TGSPLIT-LABEL: local_agent_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1196,6 +1506,10 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX11-WGP-LABEL: local_agent_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1208,6 +1522,10 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX11-CU-LABEL: local_agent_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1220,6 +1538,10 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX12-WGP-LABEL: local_agent_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1234,6 +1556,10 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX12-CU-LABEL: local_agent_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1249,6 +1575,10 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -1266,12 +1596,14 @@ entry: define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX6-LABEL: local_agent_seq_cst_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -1280,6 +1612,10 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX7-LABEL: local_agent_seq_cst_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1292,6 +1628,10 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX10-WGP-LABEL: local_agent_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -1304,6 +1644,10 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX10-CU-LABEL: local_agent_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -1316,6 +1660,10 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1328,6 +1676,10 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1339,6 +1691,10 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1350,6 +1706,10 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX942-NOTTGSPLIT-LABEL: local_agent_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1361,6 +1721,10 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX942-TGSPLIT-LABEL: local_agent_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1372,6 +1736,10 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX11-WGP-LABEL: local_agent_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1384,6 +1752,10 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX11-CU-LABEL: local_agent_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1396,6 +1768,10 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX12-WGP-LABEL: local_agent_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1410,6 +1786,10 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX12-CU-LABEL: local_agent_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1425,6 +1805,10 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -1443,133 +1827,183 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw( ; GFX6-LABEL: local_agent_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_agent_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -1577,10 +2011,14 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw( ; GFX1250-LABEL: local_agent_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -1594,11 +2032,13 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; GFX6-LABEL: local_agent_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1606,11 +2046,15 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; GFX7-LABEL: local_agent_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1618,10 +2062,14 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: local_agent_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1630,10 +2078,14 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; GFX10-CU-LABEL: local_agent_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1641,11 +2093,15 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_agent_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1653,10 +2109,14 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1664,10 +2124,14 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -1675,10 +2139,14 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1686,10 +2154,14 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: local_agent_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 @@ -1697,10 +2169,14 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: local_agent_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1709,10 +2185,14 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; GFX11-CU-LABEL: local_agent_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1720,10 +2200,14 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: local_agent_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -1732,10 +2216,14 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; GFX12-CU-LABEL: local_agent_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -1744,10 +2232,14 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; GFX1250-LABEL: local_agent_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -1762,11 +2254,13 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX6-LABEL: local_agent_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1774,11 +2268,15 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; GFX7-LABEL: local_agent_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1786,10 +2284,14 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; GFX10-WGP-LABEL: local_agent_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1798,10 +2300,14 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; GFX10-CU-LABEL: local_agent_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1810,11 +2316,15 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_agent_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1822,10 +2332,14 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1833,10 +2347,14 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_agent_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1844,10 +2362,14 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1855,10 +2377,14 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: local_agent_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1866,10 +2392,14 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; GFX11-WGP-LABEL: local_agent_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1878,10 +2408,14 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; GFX11-CU-LABEL: local_agent_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1890,10 +2424,14 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; GFX12-WGP-LABEL: local_agent_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -1904,10 +2442,14 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; GFX12-CU-LABEL: local_agent_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -1919,10 +2461,14 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX1250-LABEL: local_agent_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1938,11 +2484,13 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX6-LABEL: local_agent_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1951,11 +2499,15 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; GFX7-LABEL: local_agent_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1964,10 +2516,14 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: local_agent_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1978,10 +2534,14 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: local_agent_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1991,11 +2551,15 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2004,10 +2568,14 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2016,10 +2584,14 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2028,10 +2600,14 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2040,10 +2616,14 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: local_agent_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2052,10 +2632,14 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: local_agent_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2066,10 +2650,14 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: local_agent_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2079,10 +2667,14 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: local_agent_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -2095,10 +2687,14 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: local_agent_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -2111,10 +2707,14 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX1250-LABEL: local_agent_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2131,11 +2731,13 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX6-LABEL: local_agent_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2144,11 +2746,15 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; GFX7-LABEL: local_agent_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2157,10 +2763,14 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: local_agent_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2171,10 +2781,14 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: local_agent_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2184,11 +2798,15 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2197,10 +2815,14 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2209,10 +2831,14 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2221,10 +2847,14 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2233,10 +2863,14 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: local_agent_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2245,10 +2879,14 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: local_agent_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2259,10 +2897,14 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: local_agent_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2272,10 +2914,14 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: local_agent_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -2288,10 +2934,14 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: local_agent_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -2304,10 +2954,14 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX1250-LABEL: local_agent_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2324,11 +2978,13 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX6-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2340,6 +2996,10 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX7-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2355,6 +3015,10 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX10-WGP-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2369,6 +3033,10 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX10-CU-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2382,6 +3050,10 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_agent_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2397,6 +3069,10 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2410,6 +3086,10 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2424,6 +3104,10 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2437,6 +3121,10 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2451,6 +3139,10 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2465,6 +3157,10 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2478,6 +3174,10 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2492,6 +3192,10 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2506,6 +3210,10 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -2526,11 +3234,13 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 @@ -2543,6 +3253,10 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX7-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2559,6 +3273,10 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX10-WGP-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2575,6 +3293,10 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX10-CU-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2590,6 +3312,10 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2606,6 +3332,10 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2620,6 +3350,10 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2635,6 +3369,10 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2649,6 +3387,10 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2664,6 +3406,10 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2680,6 +3426,10 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2695,6 +3445,10 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2713,6 +3467,10 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2731,6 +3489,10 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -2753,11 +3515,13 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 @@ -2770,6 +3534,10 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX7-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2786,6 +3554,10 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX10-WGP-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2802,6 +3574,10 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX10-CU-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2817,6 +3593,10 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2833,6 +3613,10 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2847,6 +3631,10 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2862,6 +3650,10 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2876,6 +3668,10 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2891,6 +3687,10 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2907,6 +3707,10 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2922,6 +3726,10 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2940,6 +3748,10 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2958,6 +3770,10 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -2980,12 +3796,16 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2993,12 +3813,18 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3006,11 +3832,17 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3018,11 +3850,17 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3030,12 +3868,18 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3043,11 +3887,17 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3055,11 +3905,17 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3067,11 +3923,17 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3079,11 +3941,17 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3091,48 +3959,72 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -3140,12 +4032,18 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -3160,12 +4058,16 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; GFX6-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3174,12 +4076,18 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3188,11 +4096,17 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3202,11 +4116,17 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3215,12 +4135,18 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3229,11 +4155,17 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3242,11 +4174,17 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3255,11 +4193,17 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3268,11 +4212,17 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3281,12 +4231,18 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3295,12 +4251,18 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3308,12 +4270,18 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -3322,12 +4290,18 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -3336,12 +4310,18 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -3357,12 +4337,16 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX6-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3371,12 +4355,18 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; GFX7-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3385,11 +4375,17 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3399,11 +4395,17 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3413,12 +4415,18 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3427,11 +4435,17 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3440,11 +4454,17 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3453,11 +4473,17 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3466,11 +4492,17 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3479,12 +4511,18 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3493,12 +4531,18 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3507,12 +4551,18 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -3523,12 +4573,18 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -3540,12 +4596,18 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX1250-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3562,12 +4624,16 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3577,12 +4643,18 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3592,11 +4664,17 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3608,11 +4686,17 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3623,12 +4707,18 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3638,11 +4728,17 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3652,11 +4748,17 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3666,11 +4768,17 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3680,11 +4788,17 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3694,12 +4808,18 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3710,12 +4830,18 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3725,12 +4851,18 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -3743,12 +4875,18 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -3761,12 +4899,18 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3784,12 +4928,16 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3799,12 +4947,18 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3814,11 +4968,17 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3830,11 +4990,17 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3845,12 +5011,18 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3860,11 +5032,17 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3874,11 +5052,17 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3888,11 +5072,17 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3902,11 +5092,17 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3916,12 +5112,18 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3932,12 +5134,18 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3947,12 +5155,18 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -3965,12 +5179,18 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -3983,12 +5203,18 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -4006,12 +5232,16 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; GFX6-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4020,12 +5250,18 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4034,11 +5270,17 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4048,11 +5290,17 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4061,12 +5309,18 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4075,11 +5329,17 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4088,11 +5348,17 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4101,11 +5367,17 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4114,11 +5386,17 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4127,12 +5405,18 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4141,12 +5425,18 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4154,12 +5444,18 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -4168,12 +5464,18 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -4182,12 +5484,18 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -4203,12 +5511,16 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4217,12 +5529,18 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4231,11 +5549,17 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4245,11 +5569,17 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4258,12 +5588,18 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4272,11 +5608,17 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4285,11 +5627,17 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4298,11 +5646,17 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4311,11 +5665,17 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4324,12 +5684,18 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4338,12 +5704,18 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4351,12 +5723,18 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -4365,12 +5743,18 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -4379,12 +5763,18 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; GFX1250-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -4400,12 +5790,16 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX6-LABEL: local_agent_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4415,12 +5809,18 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; GFX7-LABEL: local_agent_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -4430,11 +5830,17 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4446,11 +5852,17 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4461,12 +5873,18 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4476,11 +5894,17 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4490,11 +5914,17 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4504,11 +5934,17 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4518,11 +5954,17 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4532,12 +5974,18 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4548,12 +5996,18 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4563,12 +6017,18 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -4581,12 +6041,18 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -4599,12 +6065,18 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX1250-LABEL: local_agent_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -4622,12 +6094,16 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4637,12 +6113,18 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -4652,11 +6134,17 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4668,11 +6156,17 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4683,12 +6177,18 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4698,11 +6198,17 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4712,11 +6218,17 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4726,11 +6238,17 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4740,11 +6258,17 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4754,12 +6278,18 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4770,12 +6300,18 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4785,12 +6321,18 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -4803,12 +6345,18 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -4821,12 +6369,18 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -4844,12 +6398,16 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4859,12 +6417,18 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -4874,11 +6438,17 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4890,11 +6460,17 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4905,12 +6481,18 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4920,11 +6502,17 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4934,11 +6522,17 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4948,11 +6542,17 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4962,11 +6562,17 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4976,12 +6582,18 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4992,12 +6604,18 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5007,12 +6625,18 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -5025,12 +6649,18 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -5043,12 +6673,18 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5066,12 +6702,16 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5081,12 +6721,18 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5096,11 +6742,17 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5112,11 +6764,17 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5127,12 +6785,18 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5142,11 +6806,17 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5156,11 +6826,17 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5170,11 +6846,17 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5184,11 +6866,17 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5198,12 +6886,18 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5214,12 +6908,18 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5229,12 +6929,18 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -5247,12 +6953,18 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -5265,12 +6977,18 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5288,12 +7006,16 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5303,12 +7025,18 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5318,11 +7046,17 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5334,11 +7068,17 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5349,12 +7089,18 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5364,11 +7110,17 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5378,11 +7130,17 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5392,11 +7150,17 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5406,11 +7170,17 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5420,12 +7190,18 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5436,12 +7212,18 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5451,12 +7233,18 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -5469,12 +7257,18 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -5487,12 +7281,18 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5510,12 +7310,16 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX6-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5525,12 +7329,18 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5540,11 +7350,17 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5556,11 +7372,17 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5571,12 +7393,18 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5586,11 +7414,17 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5600,11 +7434,17 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5614,11 +7454,17 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5628,11 +7474,17 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5642,12 +7494,18 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5658,12 +7516,18 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5673,12 +7537,18 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -5691,12 +7561,18 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -5709,12 +7585,18 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX1250-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5732,12 +7614,16 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5747,12 +7633,18 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5762,11 +7654,17 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5778,11 +7676,17 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5793,12 +7697,18 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5808,11 +7718,17 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5822,11 +7738,17 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5836,11 +7758,17 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5850,11 +7778,17 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5864,12 +7798,18 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5880,12 +7820,18 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5895,12 +7841,18 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -5913,12 +7865,18 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -5931,12 +7889,18 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5954,12 +7918,16 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5969,12 +7937,18 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5984,11 +7958,17 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6000,11 +7980,17 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6015,12 +8001,18 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6030,11 +8022,17 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6044,11 +8042,17 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6058,11 +8062,17 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6072,11 +8082,17 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6086,12 +8102,18 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6102,12 +8124,18 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6117,12 +8145,18 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -6135,12 +8169,18 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -6153,12 +8193,18 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -6176,12 +8222,16 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6194,6 +8244,12 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -6211,6 +8267,12 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6226,6 +8288,12 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6241,6 +8309,12 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -6258,6 +8332,12 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6273,6 +8353,12 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6288,6 +8374,12 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6303,6 +8395,12 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6318,6 +8416,12 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6333,6 +8437,12 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6348,6 +8458,12 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6363,6 +8479,12 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6379,6 +8501,12 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6403,12 +8531,16 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6421,6 +8553,12 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -6438,6 +8576,12 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6454,6 +8598,12 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6469,6 +8619,12 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -6486,6 +8642,12 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6501,6 +8663,12 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6517,6 +8685,12 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6532,6 +8706,12 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6548,6 +8728,12 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6564,6 +8750,12 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6579,6 +8771,12 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6595,6 +8793,12 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6611,6 +8815,12 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6635,12 +8845,16 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6654,6 +8868,12 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -6672,6 +8892,12 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6689,6 +8915,12 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6706,6 +8938,12 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -6724,6 +8962,12 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6740,6 +8984,12 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6756,6 +9006,12 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6772,6 +9028,12 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6788,6 +9050,12 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6805,6 +9073,12 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6822,6 +9096,12 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6841,6 +9121,12 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6861,6 +9147,12 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6887,12 +9179,16 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6906,6 +9202,12 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -6924,6 +9226,12 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6942,6 +9250,12 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6959,6 +9273,12 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -6977,6 +9297,12 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6993,6 +9319,12 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7010,6 +9342,12 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7026,6 +9364,12 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7043,6 +9387,12 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7061,6 +9411,12 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7078,6 +9434,12 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7098,6 +9460,12 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7118,6 +9486,12 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7144,12 +9518,16 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7163,6 +9541,12 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7181,6 +9565,12 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7199,6 +9589,12 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7216,6 +9612,12 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7234,6 +9636,12 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7250,6 +9658,12 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7267,6 +9681,12 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7283,6 +9703,12 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7300,6 +9726,12 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7318,6 +9750,12 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7335,6 +9773,12 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7355,6 +9799,12 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7375,6 +9825,12 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7401,12 +9857,16 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -7419,6 +9879,12 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7436,6 +9902,12 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7452,6 +9924,12 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7467,6 +9945,12 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7484,6 +9968,12 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7499,6 +9989,12 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7515,6 +10011,12 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7530,6 +10032,12 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7546,6 +10054,12 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7562,6 +10076,12 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7577,6 +10097,12 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7593,6 +10119,12 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7609,6 +10141,12 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7633,12 +10171,16 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -7651,6 +10193,12 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7668,6 +10216,12 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7684,6 +10238,12 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7699,6 +10259,12 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7716,6 +10282,12 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7731,6 +10303,12 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7747,6 +10325,12 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7762,6 +10346,12 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7778,6 +10368,12 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7794,6 +10390,12 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7809,6 +10411,12 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7825,6 +10433,12 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7841,6 +10455,12 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7865,12 +10485,16 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX6-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7884,6 +10508,12 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX7-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7902,6 +10532,12 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7920,6 +10556,12 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7937,6 +10579,12 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7955,6 +10603,12 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7971,6 +10625,12 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7988,6 +10648,12 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8004,6 +10670,12 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8021,6 +10693,12 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8039,6 +10717,12 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8056,6 +10740,12 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8076,6 +10766,12 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8096,6 +10792,12 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -8122,12 +10824,16 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8141,6 +10847,12 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -8159,6 +10871,12 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8177,6 +10895,12 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8194,6 +10918,12 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -8212,6 +10942,12 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8228,6 +10964,12 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8245,6 +10987,12 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8261,6 +11009,12 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8278,6 +11032,12 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8296,6 +11056,12 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8313,6 +11079,12 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8333,6 +11105,12 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8353,6 +11131,12 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -8379,12 +11163,16 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8398,6 +11186,12 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -8416,6 +11210,12 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8434,6 +11234,12 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8451,6 +11257,12 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -8469,6 +11281,12 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8485,6 +11303,12 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8502,6 +11326,12 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8518,6 +11348,12 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8535,6 +11371,12 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8553,6 +11395,12 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8570,6 +11418,12 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8590,6 +11444,12 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8610,6 +11470,12 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -8636,12 +11502,16 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8655,6 +11525,12 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -8673,6 +11549,12 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8691,6 +11573,12 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8708,6 +11596,12 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -8726,6 +11620,12 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8742,6 +11642,12 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8759,6 +11665,12 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8775,6 +11687,12 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8792,6 +11710,12 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8810,6 +11734,12 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8827,6 +11757,12 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8847,6 +11783,12 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8867,6 +11809,12 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -8893,12 +11841,16 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8912,6 +11864,12 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -8930,6 +11888,12 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8948,6 +11912,12 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8965,6 +11935,12 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -8983,6 +11959,12 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8999,6 +11981,12 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9016,6 +12004,12 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9032,6 +12026,12 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9049,6 +12049,12 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9067,6 +12073,12 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9084,6 +12096,12 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9104,6 +12122,12 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9124,6 +12148,12 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9150,12 +12180,16 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9169,6 +12203,12 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -9187,6 +12227,12 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9205,6 +12251,12 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9222,6 +12274,12 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -9240,6 +12298,12 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9256,6 +12320,12 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9273,6 +12343,12 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9289,6 +12365,12 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9306,6 +12388,12 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9324,6 +12412,12 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9341,6 +12435,12 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9361,6 +12461,12 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9381,6 +12487,12 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9407,12 +12519,16 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9426,6 +12542,12 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -9444,6 +12566,12 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9462,6 +12590,12 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9479,6 +12613,12 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -9497,6 +12637,12 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9513,6 +12659,12 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9530,6 +12682,12 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9546,6 +12704,12 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9563,6 +12727,12 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9581,6 +12751,12 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9598,6 +12774,12 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9618,6 +12800,12 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9638,6 +12826,12 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9664,12 +12858,16 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9683,6 +12881,12 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -9701,6 +12905,12 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9719,6 +12929,12 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9736,6 +12952,12 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -9754,6 +12976,12 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9770,6 +12998,12 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9787,6 +13021,12 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9803,6 +13043,12 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9820,6 +13066,12 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9838,6 +13090,12 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9855,6 +13113,12 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9875,6 +13139,12 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9895,6 +13165,12 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9936,135 +13212,169 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load( ; ; GFX7-LABEL: local_agent_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -10072,11 +13382,15 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load( ; ; GFX12-CU-LABEL: local_agent_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -10085,11 +13399,15 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load( ; GFX1250-LABEL: local_agent_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -10120,135 +13438,169 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load( ; ; GFX7-LABEL: local_agent_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -10256,11 +13608,15 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load( ; ; GFX12-CU-LABEL: local_agent_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -10269,11 +13625,15 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load( ; GFX1250-LABEL: local_agent_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -10304,135 +13664,169 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; ; GFX7-LABEL: local_agent_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -10440,11 +13834,15 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; ; GFX12-CU-LABEL: local_agent_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -10453,11 +13851,15 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; GFX1250-LABEL: local_agent_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -10488,135 +13890,169 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; ; GFX7-LABEL: local_agent_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -10624,11 +14060,15 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -10637,11 +14077,15 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX1250-LABEL: local_agent_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -10656,12 +14100,14 @@ entry: define amdgpu_kernel void @local_agent_one_as_unordered_store( ; GFX6-LABEL: local_agent_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -10669,6 +14115,10 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; GFX7-LABEL: local_agent_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10680,6 +14130,10 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; GFX10-WGP-LABEL: local_agent_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -10690,6 +14144,10 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; GFX10-CU-LABEL: local_agent_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -10700,6 +14158,10 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; SKIP-CACHE-INV-LABEL: local_agent_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10711,6 +14173,10 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -10721,6 +14187,10 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -10731,6 +14201,10 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -10741,6 +14215,10 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; GFX942-TGSPLIT-LABEL: local_agent_one_as_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -10751,6 +14229,10 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; GFX11-WGP-LABEL: local_agent_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -10761,6 +14243,10 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; GFX11-CU-LABEL: local_agent_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -10771,6 +14257,10 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; GFX12-WGP-LABEL: local_agent_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -10781,6 +14271,10 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; GFX12-CU-LABEL: local_agent_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -10792,6 +14286,10 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -10807,12 +14305,14 @@ entry: define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; GFX6-LABEL: local_agent_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -10820,6 +14320,10 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; GFX7-LABEL: local_agent_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10831,6 +14335,10 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -10841,6 +14349,10 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; GFX10-CU-LABEL: local_agent_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -10851,6 +14363,10 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10862,6 +14378,10 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -10872,6 +14392,10 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -10882,6 +14406,10 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -10892,6 +14420,10 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; GFX942-TGSPLIT-LABEL: local_agent_one_as_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -10902,6 +14434,10 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -10912,6 +14448,10 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; GFX11-CU-LABEL: local_agent_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -10922,6 +14462,10 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -10932,6 +14476,10 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; GFX12-CU-LABEL: local_agent_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -10943,6 +14491,10 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -10958,12 +14510,14 @@ entry: define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX6-LABEL: local_agent_one_as_release_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -10971,6 +14525,10 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX7-LABEL: local_agent_one_as_release_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10982,6 +14540,10 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX10-WGP-LABEL: local_agent_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -10992,6 +14554,10 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX10-CU-LABEL: local_agent_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -11002,6 +14568,10 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11013,6 +14583,10 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -11023,6 +14597,10 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -11033,6 +14611,10 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -11043,6 +14625,10 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX942-TGSPLIT-LABEL: local_agent_one_as_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -11053,6 +14639,10 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX11-WGP-LABEL: local_agent_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -11063,6 +14653,10 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX11-CU-LABEL: local_agent_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -11073,6 +14667,10 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX12-WGP-LABEL: local_agent_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -11083,6 +14681,10 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX12-CU-LABEL: local_agent_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -11094,6 +14696,10 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -11109,12 +14715,14 @@ entry: define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX6-LABEL: local_agent_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -11122,6 +14730,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX7-LABEL: local_agent_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11133,6 +14745,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -11143,6 +14759,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -11153,6 +14773,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11164,6 +14788,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -11174,6 +14802,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -11184,6 +14816,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -11194,6 +14830,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -11204,6 +14844,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -11214,6 +14858,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -11224,6 +14872,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -11234,6 +14886,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -11245,6 +14901,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -11261,133 +14921,183 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( ; GFX6-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -11395,10 +15105,14 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( ; GFX1250-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -11412,133 +15126,183 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX6-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -11546,10 +15310,14 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX1250-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -11563,133 +15331,183 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX6-LABEL: local_agent_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_agent_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -11697,10 +15515,14 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX1250-LABEL: local_agent_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -11714,133 +15536,183 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -11848,10 +15720,14 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX1250-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -11865,133 +15741,183 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -11999,10 +15925,14 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX1250-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -12016,11 +15946,13 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -12032,6 +15964,10 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12047,6 +15983,10 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -12060,6 +16000,10 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX10-CU-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -12073,6 +16017,10 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12088,6 +16036,10 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -12101,6 +16053,10 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -12114,6 +16070,10 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -12127,6 +16087,10 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -12140,6 +16104,10 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12153,6 +16121,10 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12166,6 +16138,10 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12179,6 +16155,10 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12193,6 +16173,10 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -12213,11 +16197,13 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -12229,6 +16215,10 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12244,6 +16234,10 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -12257,6 +16251,10 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -12270,6 +16268,10 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12285,6 +16287,10 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -12298,6 +16304,10 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -12311,6 +16321,10 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -12324,6 +16338,10 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -12337,6 +16355,10 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12350,6 +16372,10 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12363,6 +16389,10 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12376,6 +16406,10 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12390,6 +16424,10 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -12410,11 +16448,13 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -12426,6 +16466,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12441,6 +16485,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -12454,6 +16502,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -12467,6 +16519,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12482,6 +16538,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -12495,6 +16555,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -12508,6 +16572,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -12521,6 +16589,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -12534,6 +16606,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12547,6 +16623,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12560,6 +16640,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12573,6 +16657,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12587,6 +16675,10 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -12607,12 +16699,16 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12620,12 +16716,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12633,11 +16735,17 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12645,11 +16753,17 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12657,12 +16771,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12670,11 +16790,17 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12682,11 +16808,17 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12694,11 +16826,17 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12706,11 +16844,17 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12718,48 +16862,72 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -12767,12 +16935,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -12787,12 +16961,16 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12800,12 +16978,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12813,11 +16997,17 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12825,11 +17015,17 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12837,12 +17033,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12850,11 +17052,17 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12862,11 +17070,17 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12874,11 +17088,17 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12886,11 +17106,17 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12898,48 +17124,72 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -12947,12 +17197,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -12967,12 +17223,16 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12980,12 +17240,18 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12993,11 +17259,17 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13005,11 +17277,17 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13017,12 +17295,18 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13030,11 +17314,17 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13042,11 +17332,17 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13054,11 +17350,17 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13066,11 +17368,17 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13078,48 +17386,72 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13127,12 +17459,18 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX1250-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13147,12 +17485,16 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13160,12 +17502,18 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13173,11 +17521,17 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13185,11 +17539,17 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13197,12 +17557,18 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13210,11 +17576,17 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13222,11 +17594,17 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13234,11 +17612,17 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13246,11 +17630,17 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13258,48 +17648,72 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13307,12 +17721,18 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13327,12 +17747,16 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13340,12 +17764,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13353,11 +17783,17 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13365,11 +17801,17 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13377,12 +17819,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13390,11 +17838,17 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13402,11 +17856,17 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13414,11 +17874,17 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13426,11 +17892,17 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13438,48 +17910,72 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13487,12 +17983,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13507,12 +18009,16 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13520,12 +18026,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13533,11 +18045,17 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13545,11 +18063,17 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13557,12 +18081,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13570,11 +18100,17 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13582,11 +18118,17 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13594,11 +18136,17 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13606,11 +18154,17 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13618,48 +18172,72 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13667,12 +18245,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13687,12 +18271,16 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13700,12 +18288,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13713,11 +18307,17 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13725,11 +18325,17 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13737,12 +18343,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13750,11 +18362,17 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13762,11 +18380,17 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13774,11 +18398,17 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13786,11 +18416,17 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13798,48 +18434,72 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13847,12 +18507,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX1250-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13867,12 +18533,16 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13880,12 +18550,18 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13893,11 +18569,17 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13905,11 +18587,17 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13917,12 +18605,18 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13930,11 +18624,17 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13942,11 +18642,17 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13954,11 +18660,17 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13966,11 +18678,17 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13978,48 +18696,72 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14027,12 +18769,18 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX1250-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14047,12 +18795,16 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14060,12 +18812,18 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14073,11 +18831,17 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14085,11 +18849,17 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14097,12 +18867,18 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14110,11 +18886,17 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14122,11 +18904,17 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14134,11 +18922,17 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14146,11 +18940,17 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14158,48 +18958,72 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14207,12 +19031,18 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14227,12 +19057,16 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14240,12 +19074,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14253,11 +19093,17 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14265,11 +19111,17 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14277,12 +19129,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14290,11 +19148,17 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14302,11 +19166,17 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14314,11 +19184,17 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14326,11 +19202,17 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14338,48 +19220,72 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14387,12 +19293,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14407,12 +19319,16 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14420,12 +19336,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14433,11 +19355,17 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14445,11 +19373,17 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14457,12 +19391,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14470,11 +19410,17 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14482,11 +19428,17 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14494,11 +19446,17 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14506,11 +19464,17 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14518,48 +19482,72 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14567,12 +19555,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14587,12 +19581,16 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14600,12 +19598,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14613,11 +19617,17 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14625,11 +19635,17 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14637,12 +19653,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14650,11 +19672,17 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14662,11 +19690,17 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14674,11 +19708,17 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14686,11 +19726,17 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14698,48 +19744,72 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14747,12 +19817,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14767,12 +19843,16 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14780,12 +19860,18 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14793,11 +19879,17 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14805,11 +19897,17 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14817,12 +19915,18 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14830,11 +19934,17 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14842,11 +19952,17 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14854,11 +19970,17 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14866,11 +19988,17 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14878,48 +20006,72 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14927,12 +20079,18 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX1250-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14947,12 +20105,16 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14960,12 +20122,18 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14973,11 +20141,17 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14985,11 +20159,17 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14997,12 +20177,18 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15010,11 +20196,17 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15022,11 +20214,17 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15034,11 +20232,17 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15046,11 +20250,17 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15058,48 +20268,72 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -15107,12 +20341,18 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -15127,12 +20367,16 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15140,12 +20384,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15153,11 +20403,17 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15165,11 +20421,17 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15177,12 +20439,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15190,11 +20458,17 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15202,11 +20476,17 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15214,11 +20494,17 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15226,11 +20512,17 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15238,48 +20530,72 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -15287,12 +20603,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -15307,12 +20629,16 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -15325,6 +20651,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -15342,6 +20674,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15357,6 +20695,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15372,6 +20716,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -15389,6 +20739,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15404,6 +20760,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15419,6 +20781,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15434,6 +20802,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15449,6 +20823,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15464,6 +20844,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15479,6 +20865,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15494,6 +20886,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15510,6 +20908,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15534,12 +20938,16 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -15552,6 +20960,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -15569,6 +20983,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15584,6 +21004,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15599,6 +21025,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -15616,6 +21048,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15631,6 +21069,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15646,6 +21090,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15661,6 +21111,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15676,6 +21132,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15691,6 +21153,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15706,6 +21174,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15721,6 +21195,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15737,6 +21217,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15761,12 +21247,16 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -15779,6 +21269,12 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -15796,6 +21292,12 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15811,6 +21313,12 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15826,6 +21334,12 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -15843,6 +21357,12 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15858,6 +21378,12 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15873,6 +21399,12 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15888,6 +21420,12 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15903,6 +21441,12 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15918,6 +21462,12 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15933,6 +21483,12 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15948,6 +21504,12 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15964,6 +21526,12 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15988,12 +21556,16 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16006,6 +21578,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16023,6 +21601,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16038,6 +21622,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16053,6 +21643,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16070,6 +21666,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16085,6 +21687,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16100,6 +21708,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16115,6 +21729,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16130,6 +21750,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16145,6 +21771,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16160,6 +21792,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16175,6 +21813,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16191,6 +21835,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16215,12 +21865,16 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16233,6 +21887,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16250,6 +21910,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16265,6 +21931,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16280,6 +21952,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16297,6 +21975,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16312,6 +21996,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16327,6 +22017,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16342,6 +22038,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16357,6 +22059,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16372,6 +22080,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16387,6 +22101,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16402,6 +22122,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16418,6 +22144,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16442,12 +22174,16 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16460,6 +22196,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16477,6 +22219,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16492,6 +22240,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16507,6 +22261,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16524,6 +22284,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16539,6 +22305,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16554,6 +22326,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16569,6 +22347,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16584,6 +22368,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16599,6 +22389,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16614,6 +22410,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16629,6 +22431,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16645,6 +22453,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16669,12 +22483,16 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16687,6 +22505,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16704,6 +22528,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16719,6 +22549,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16734,6 +22570,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16751,6 +22593,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16766,6 +22614,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16781,6 +22635,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16796,6 +22656,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16811,6 +22677,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16826,6 +22698,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16841,6 +22719,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16856,6 +22740,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16872,6 +22762,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16896,12 +22792,16 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16914,6 +22814,12 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16931,6 +22837,12 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16946,6 +22858,12 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16961,6 +22879,12 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16978,6 +22902,12 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16993,6 +22923,12 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17008,6 +22944,12 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17023,6 +22965,12 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17038,6 +22986,12 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17053,6 +23007,12 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17068,6 +23028,12 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17083,6 +23049,12 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17099,6 +23071,12 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -17123,12 +23101,16 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -17141,6 +23123,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -17158,6 +23146,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17173,6 +23167,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17188,6 +23188,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -17205,6 +23211,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17220,6 +23232,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17235,6 +23253,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17250,6 +23274,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17265,6 +23295,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17280,6 +23316,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17295,6 +23337,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17310,6 +23358,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17326,6 +23380,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -17350,12 +23410,16 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -17368,6 +23432,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -17385,6 +23455,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17400,6 +23476,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17415,6 +23497,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -17432,6 +23520,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17447,6 +23541,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17462,6 +23562,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17477,6 +23583,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17492,6 +23604,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17507,6 +23625,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17522,6 +23646,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17537,6 +23667,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17553,6 +23689,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -17577,12 +23719,16 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -17595,6 +23741,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -17612,6 +23764,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17627,6 +23785,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17642,6 +23806,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -17659,6 +23829,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17674,6 +23850,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17689,6 +23871,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17704,6 +23892,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17719,6 +23913,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17734,6 +23934,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17749,6 +23955,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17764,6 +23976,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17780,6 +23998,12 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -17804,12 +24028,16 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -17822,6 +24050,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -17839,6 +24073,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17854,6 +24094,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17869,6 +24115,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -17886,6 +24138,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17901,6 +24159,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17916,6 +24180,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17931,6 +24201,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17946,6 +24222,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17961,6 +24243,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17976,6 +24264,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17991,6 +24285,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18007,6 +24307,12 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -18031,12 +24337,16 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -18049,6 +24359,12 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -18066,6 +24382,12 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18081,6 +24403,12 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18096,6 +24424,12 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -18113,6 +24447,12 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18128,6 +24468,12 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18143,6 +24489,12 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18158,6 +24510,12 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18173,6 +24531,12 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18188,6 +24552,12 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18203,6 +24573,12 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18218,6 +24594,12 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18234,6 +24616,12 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -18258,12 +24646,16 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -18276,6 +24668,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -18293,6 +24691,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18308,6 +24712,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18323,6 +24733,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -18340,6 +24756,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18355,6 +24777,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18370,6 +24798,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18385,6 +24819,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18400,6 +24840,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18415,6 +24861,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18430,6 +24882,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18445,6 +24903,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18461,6 +24925,12 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -18485,12 +24955,16 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -18503,6 +24977,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -18520,6 +25000,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18535,6 +25021,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18550,6 +25042,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -18567,6 +25065,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18582,6 +25086,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18597,6 +25107,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18612,6 +25128,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18627,6 +25149,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18642,6 +25170,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18657,6 +25191,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18672,6 +25212,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18688,6 +25234,12 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll index 910eae0c8931..9dfdf42255ea 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll @@ -33,135 +33,169 @@ define amdgpu_kernel void @local_cluster_unordered_load( ; ; GFX7-LABEL: local_cluster_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_cluster_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_cluster_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_cluster_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_cluster_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_cluster_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_cluster_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -169,11 +203,15 @@ define amdgpu_kernel void @local_cluster_unordered_load( ; ; GFX12-CU-LABEL: local_cluster_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -182,11 +220,15 @@ define amdgpu_kernel void @local_cluster_unordered_load( ; GFX1250-LABEL: local_cluster_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -217,135 +259,169 @@ define amdgpu_kernel void @local_cluster_monotonic_load( ; ; GFX7-LABEL: local_cluster_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_cluster_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_cluster_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_cluster_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -353,11 +429,15 @@ define amdgpu_kernel void @local_cluster_monotonic_load( ; ; GFX12-CU-LABEL: local_cluster_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -366,11 +446,15 @@ define amdgpu_kernel void @local_cluster_monotonic_load( ; GFX1250-LABEL: local_cluster_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -401,10 +485,13 @@ define amdgpu_kernel void @local_cluster_acquire_load( ; ; GFX7-LABEL: local_cluster_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -415,9 +502,12 @@ define amdgpu_kernel void @local_cluster_acquire_load( ; ; GFX10-WGP-LABEL: local_cluster_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -428,9 +518,12 @@ define amdgpu_kernel void @local_cluster_acquire_load( ; ; GFX10-CU-LABEL: local_cluster_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -440,10 +533,13 @@ define amdgpu_kernel void @local_cluster_acquire_load( ; ; SKIP-CACHE-INV-LABEL: local_cluster_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -454,9 +550,12 @@ define amdgpu_kernel void @local_cluster_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -466,22 +565,28 @@ define amdgpu_kernel void @local_cluster_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -491,22 +596,28 @@ define amdgpu_kernel void @local_cluster_acquire_load( ; ; GFX942-TGSPLIT-LABEL: local_cluster_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_cluster_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -517,9 +628,12 @@ define amdgpu_kernel void @local_cluster_acquire_load( ; ; GFX11-CU-LABEL: local_cluster_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -529,25 +643,33 @@ define amdgpu_kernel void @local_cluster_acquire_load( ; ; GFX12-WGP-LABEL: local_cluster_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_cluster_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -555,12 +677,16 @@ define amdgpu_kernel void @local_cluster_acquire_load( ; GFX1250-LABEL: local_cluster_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 ; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: ds_store_b32 v0, v1 ; GFX1250-NEXT: s_endpgm @@ -591,10 +717,13 @@ define amdgpu_kernel void @local_cluster_seq_cst_load( ; ; GFX7-LABEL: local_cluster_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_read_b32 v1, v0 @@ -606,9 +735,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_load( ; ; GFX10-WGP-LABEL: local_cluster_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -621,9 +753,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_load( ; ; GFX10-CU-LABEL: local_cluster_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -635,10 +770,13 @@ define amdgpu_kernel void @local_cluster_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 @@ -650,9 +788,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -663,9 +804,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -677,9 +821,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -690,9 +837,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_load( ; ; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -704,9 +854,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_load( ; ; GFX11-WGP-LABEL: local_cluster_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -719,9 +872,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_load( ; ; GFX11-CU-LABEL: local_cluster_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -733,9 +889,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_load( ; ; GFX12-WGP-LABEL: local_cluster_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -744,15 +903,19 @@ define amdgpu_kernel void @local_cluster_seq_cst_load( ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_cluster_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -760,6 +923,7 @@ define amdgpu_kernel void @local_cluster_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -767,14 +931,18 @@ define amdgpu_kernel void @local_cluster_seq_cst_load( ; GFX1250-LABEL: local_cluster_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_load_b32 v1, v0 ; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: ds_store_b32 v0, v1 ; GFX1250-NEXT: s_endpgm @@ -788,12 +956,14 @@ entry: define amdgpu_kernel void @local_cluster_unordered_store( ; GFX6-LABEL: local_cluster_unordered_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -801,6 +971,10 @@ define amdgpu_kernel void @local_cluster_unordered_store( ; GFX7-LABEL: local_cluster_unordered_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -812,6 +986,10 @@ define amdgpu_kernel void @local_cluster_unordered_store( ; GFX10-WGP-LABEL: local_cluster_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -822,6 +1000,10 @@ define amdgpu_kernel void @local_cluster_unordered_store( ; GFX10-CU-LABEL: local_cluster_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -832,6 +1014,10 @@ define amdgpu_kernel void @local_cluster_unordered_store( ; SKIP-CACHE-INV-LABEL: local_cluster_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -843,6 +1029,10 @@ define amdgpu_kernel void @local_cluster_unordered_store( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -853,6 +1043,10 @@ define amdgpu_kernel void @local_cluster_unordered_store( ; GFX90A-TGSPLIT-LABEL: local_cluster_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -863,6 +1057,10 @@ define amdgpu_kernel void @local_cluster_unordered_store( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -873,6 +1071,10 @@ define amdgpu_kernel void @local_cluster_unordered_store( ; GFX942-TGSPLIT-LABEL: local_cluster_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -883,6 +1085,10 @@ define amdgpu_kernel void @local_cluster_unordered_store( ; GFX11-WGP-LABEL: local_cluster_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -893,6 +1099,10 @@ define amdgpu_kernel void @local_cluster_unordered_store( ; GFX11-CU-LABEL: local_cluster_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -903,6 +1113,10 @@ define amdgpu_kernel void @local_cluster_unordered_store( ; GFX12-WGP-LABEL: local_cluster_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -913,6 +1127,10 @@ define amdgpu_kernel void @local_cluster_unordered_store( ; GFX12-CU-LABEL: local_cluster_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -924,6 +1142,10 @@ define amdgpu_kernel void @local_cluster_unordered_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -939,12 +1161,14 @@ entry: define amdgpu_kernel void @local_cluster_monotonic_store( ; GFX6-LABEL: local_cluster_monotonic_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -952,6 +1176,10 @@ define amdgpu_kernel void @local_cluster_monotonic_store( ; GFX7-LABEL: local_cluster_monotonic_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -963,6 +1191,10 @@ define amdgpu_kernel void @local_cluster_monotonic_store( ; GFX10-WGP-LABEL: local_cluster_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -973,6 +1205,10 @@ define amdgpu_kernel void @local_cluster_monotonic_store( ; GFX10-CU-LABEL: local_cluster_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -983,6 +1219,10 @@ define amdgpu_kernel void @local_cluster_monotonic_store( ; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -994,6 +1234,10 @@ define amdgpu_kernel void @local_cluster_monotonic_store( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1004,6 +1248,10 @@ define amdgpu_kernel void @local_cluster_monotonic_store( ; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1014,6 +1262,10 @@ define amdgpu_kernel void @local_cluster_monotonic_store( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1024,6 +1276,10 @@ define amdgpu_kernel void @local_cluster_monotonic_store( ; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1034,6 +1290,10 @@ define amdgpu_kernel void @local_cluster_monotonic_store( ; GFX11-WGP-LABEL: local_cluster_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1044,6 +1304,10 @@ define amdgpu_kernel void @local_cluster_monotonic_store( ; GFX11-CU-LABEL: local_cluster_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1054,6 +1318,10 @@ define amdgpu_kernel void @local_cluster_monotonic_store( ; GFX12-WGP-LABEL: local_cluster_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1064,6 +1332,10 @@ define amdgpu_kernel void @local_cluster_monotonic_store( ; GFX12-CU-LABEL: local_cluster_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1075,6 +1347,10 @@ define amdgpu_kernel void @local_cluster_monotonic_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -1090,12 +1366,14 @@ entry: define amdgpu_kernel void @local_cluster_release_store( ; GFX6-LABEL: local_cluster_release_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -1104,6 +1382,10 @@ define amdgpu_kernel void @local_cluster_release_store( ; GFX7-LABEL: local_cluster_release_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1116,6 +1398,10 @@ define amdgpu_kernel void @local_cluster_release_store( ; GFX10-WGP-LABEL: local_cluster_release_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -1128,6 +1414,10 @@ define amdgpu_kernel void @local_cluster_release_store( ; GFX10-CU-LABEL: local_cluster_release_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -1140,6 +1430,10 @@ define amdgpu_kernel void @local_cluster_release_store( ; SKIP-CACHE-INV-LABEL: local_cluster_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1152,6 +1446,10 @@ define amdgpu_kernel void @local_cluster_release_store( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1163,6 +1461,10 @@ define amdgpu_kernel void @local_cluster_release_store( ; GFX90A-TGSPLIT-LABEL: local_cluster_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1174,6 +1476,10 @@ define amdgpu_kernel void @local_cluster_release_store( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1185,6 +1491,10 @@ define amdgpu_kernel void @local_cluster_release_store( ; GFX942-TGSPLIT-LABEL: local_cluster_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1196,6 +1506,10 @@ define amdgpu_kernel void @local_cluster_release_store( ; GFX11-WGP-LABEL: local_cluster_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1208,6 +1522,10 @@ define amdgpu_kernel void @local_cluster_release_store( ; GFX11-CU-LABEL: local_cluster_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1220,6 +1538,10 @@ define amdgpu_kernel void @local_cluster_release_store( ; GFX12-WGP-LABEL: local_cluster_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1234,6 +1556,10 @@ define amdgpu_kernel void @local_cluster_release_store( ; GFX12-CU-LABEL: local_cluster_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1249,6 +1575,10 @@ define amdgpu_kernel void @local_cluster_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -1266,12 +1596,14 @@ entry: define amdgpu_kernel void @local_cluster_seq_cst_store( ; GFX6-LABEL: local_cluster_seq_cst_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -1280,6 +1612,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_store( ; GFX7-LABEL: local_cluster_seq_cst_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1292,6 +1628,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_store( ; GFX10-WGP-LABEL: local_cluster_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -1304,6 +1644,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_store( ; GFX10-CU-LABEL: local_cluster_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -1316,6 +1660,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_store( ; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1328,6 +1676,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_store( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1339,6 +1691,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_store( ; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1350,6 +1706,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_store( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1361,6 +1721,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_store( ; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1372,6 +1736,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_store( ; GFX11-WGP-LABEL: local_cluster_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1384,6 +1752,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_store( ; GFX11-CU-LABEL: local_cluster_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1396,6 +1768,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_store( ; GFX12-WGP-LABEL: local_cluster_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1410,6 +1786,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_store( ; GFX12-CU-LABEL: local_cluster_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1425,6 +1805,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -1443,133 +1827,183 @@ define amdgpu_kernel void @local_cluster_monotonic_atomicrmw( ; GFX6-LABEL: local_cluster_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_cluster_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_cluster_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_cluster_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_cluster_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_cluster_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -1577,10 +2011,14 @@ define amdgpu_kernel void @local_cluster_monotonic_atomicrmw( ; GFX1250-LABEL: local_cluster_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -1594,11 +2032,13 @@ define amdgpu_kernel void @local_cluster_acquire_atomicrmw( ; GFX6-LABEL: local_cluster_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1606,11 +2046,15 @@ define amdgpu_kernel void @local_cluster_acquire_atomicrmw( ; ; GFX7-LABEL: local_cluster_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1618,10 +2062,14 @@ define amdgpu_kernel void @local_cluster_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: local_cluster_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1630,10 +2078,14 @@ define amdgpu_kernel void @local_cluster_acquire_atomicrmw( ; ; GFX10-CU-LABEL: local_cluster_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1641,11 +2093,15 @@ define amdgpu_kernel void @local_cluster_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_cluster_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1653,10 +2109,14 @@ define amdgpu_kernel void @local_cluster_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1664,10 +2124,14 @@ define amdgpu_kernel void @local_cluster_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -1675,10 +2139,14 @@ define amdgpu_kernel void @local_cluster_acquire_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1686,10 +2154,14 @@ define amdgpu_kernel void @local_cluster_acquire_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: local_cluster_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 @@ -1697,10 +2169,14 @@ define amdgpu_kernel void @local_cluster_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: local_cluster_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1709,10 +2185,14 @@ define amdgpu_kernel void @local_cluster_acquire_atomicrmw( ; ; GFX11-CU-LABEL: local_cluster_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1720,10 +2200,14 @@ define amdgpu_kernel void @local_cluster_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: local_cluster_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -1732,10 +2216,14 @@ define amdgpu_kernel void @local_cluster_acquire_atomicrmw( ; ; GFX12-CU-LABEL: local_cluster_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -1744,10 +2232,14 @@ define amdgpu_kernel void @local_cluster_acquire_atomicrmw( ; GFX1250-LABEL: local_cluster_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -1762,11 +2254,13 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw( ; GFX6-LABEL: local_cluster_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1774,11 +2268,15 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw( ; ; GFX7-LABEL: local_cluster_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1786,10 +2284,14 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw( ; ; GFX10-WGP-LABEL: local_cluster_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1798,10 +2300,14 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw( ; ; GFX10-CU-LABEL: local_cluster_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1810,11 +2316,15 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_cluster_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1822,10 +2332,14 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1833,10 +2347,14 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1844,10 +2362,14 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1855,10 +2377,14 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: local_cluster_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1866,10 +2392,14 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw( ; ; GFX11-WGP-LABEL: local_cluster_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1878,10 +2408,14 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw( ; ; GFX11-CU-LABEL: local_cluster_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1890,10 +2424,14 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw( ; ; GFX12-WGP-LABEL: local_cluster_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -1904,10 +2442,14 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw( ; ; GFX12-CU-LABEL: local_cluster_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -1919,10 +2461,14 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw( ; GFX1250-LABEL: local_cluster_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1938,11 +2484,13 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw( ; GFX6-LABEL: local_cluster_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1951,11 +2499,15 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw( ; ; GFX7-LABEL: local_cluster_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1964,10 +2516,14 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: local_cluster_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1978,10 +2534,14 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: local_cluster_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1991,11 +2551,15 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_cluster_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2004,10 +2568,14 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2016,10 +2584,14 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2028,10 +2600,14 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2040,10 +2616,14 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: local_cluster_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2052,10 +2632,14 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: local_cluster_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2066,10 +2650,14 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: local_cluster_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2079,10 +2667,14 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: local_cluster_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -2095,10 +2687,14 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: local_cluster_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -2111,10 +2707,14 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw( ; GFX1250-LABEL: local_cluster_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2131,11 +2731,13 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw( ; GFX6-LABEL: local_cluster_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2144,11 +2746,15 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw( ; ; GFX7-LABEL: local_cluster_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2157,10 +2763,14 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: local_cluster_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2171,10 +2781,14 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: local_cluster_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2184,11 +2798,15 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2197,10 +2815,14 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2209,10 +2831,14 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2221,10 +2847,14 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2233,10 +2863,14 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2245,10 +2879,14 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: local_cluster_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2259,10 +2897,14 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: local_cluster_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2272,10 +2914,14 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: local_cluster_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -2288,10 +2934,14 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: local_cluster_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -2304,10 +2954,14 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw( ; GFX1250-LABEL: local_cluster_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2324,11 +2978,13 @@ define amdgpu_kernel void @local_cluster_acquire_ret_atomicrmw( ; GFX6-LABEL: local_cluster_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2340,6 +2996,10 @@ define amdgpu_kernel void @local_cluster_acquire_ret_atomicrmw( ; GFX7-LABEL: local_cluster_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2355,6 +3015,10 @@ define amdgpu_kernel void @local_cluster_acquire_ret_atomicrmw( ; GFX10-WGP-LABEL: local_cluster_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2369,6 +3033,10 @@ define amdgpu_kernel void @local_cluster_acquire_ret_atomicrmw( ; GFX10-CU-LABEL: local_cluster_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2382,6 +3050,10 @@ define amdgpu_kernel void @local_cluster_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_cluster_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2397,6 +3069,10 @@ define amdgpu_kernel void @local_cluster_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2410,6 +3086,10 @@ define amdgpu_kernel void @local_cluster_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2424,6 +3104,10 @@ define amdgpu_kernel void @local_cluster_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2437,6 +3121,10 @@ define amdgpu_kernel void @local_cluster_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_cluster_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2451,6 +3139,10 @@ define amdgpu_kernel void @local_cluster_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: local_cluster_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2465,6 +3157,10 @@ define amdgpu_kernel void @local_cluster_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: local_cluster_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2478,6 +3174,10 @@ define amdgpu_kernel void @local_cluster_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: local_cluster_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2492,6 +3192,10 @@ define amdgpu_kernel void @local_cluster_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: local_cluster_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2506,6 +3210,10 @@ define amdgpu_kernel void @local_cluster_acquire_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -2526,11 +3234,13 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_cluster_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 @@ -2543,6 +3253,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw( ; GFX7-LABEL: local_cluster_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2559,6 +3273,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw( ; GFX10-WGP-LABEL: local_cluster_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2575,6 +3293,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw( ; GFX10-CU-LABEL: local_cluster_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2590,6 +3312,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_cluster_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2606,6 +3332,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2620,6 +3350,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_cluster_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2635,6 +3369,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2649,6 +3387,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_cluster_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2664,6 +3406,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: local_cluster_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2680,6 +3426,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: local_cluster_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2695,6 +3445,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: local_cluster_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2713,6 +3467,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: local_cluster_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2731,6 +3489,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -2753,11 +3515,13 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_cluster_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 @@ -2770,6 +3534,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw( ; GFX7-LABEL: local_cluster_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2786,6 +3554,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw( ; GFX10-WGP-LABEL: local_cluster_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2802,6 +3574,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw( ; GFX10-CU-LABEL: local_cluster_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2817,6 +3593,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2833,6 +3613,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2847,6 +3631,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2862,6 +3650,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2876,6 +3668,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2891,6 +3687,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: local_cluster_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2907,6 +3707,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: local_cluster_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2922,6 +3726,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: local_cluster_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2940,6 +3748,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: local_cluster_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2958,6 +3770,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -2980,12 +3796,16 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: local_cluster_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2993,12 +3813,18 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: local_cluster_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3006,11 +3832,17 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3018,11 +3850,17 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3030,12 +3868,18 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3043,11 +3887,17 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3055,11 +3905,17 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3067,11 +3923,17 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3079,11 +3941,17 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3091,48 +3959,72 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_cluster_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -3140,12 +4032,18 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: local_cluster_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -3160,12 +4058,16 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_cmpxchg( ; GFX6-LABEL: local_cluster_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3174,12 +4076,18 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: local_cluster_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3188,11 +4096,17 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3202,11 +4116,17 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3215,12 +4135,18 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3229,11 +4155,17 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3242,11 +4174,17 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3255,11 +4193,17 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3268,11 +4212,17 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3281,12 +4231,18 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3295,12 +4251,18 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_cluster_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3308,12 +4270,18 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_cluster_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -3322,12 +4290,18 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_cluster_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -3336,12 +4310,18 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: local_cluster_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -3357,12 +4337,16 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg( ; GFX6-LABEL: local_cluster_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3371,12 +4355,18 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg( ; ; GFX7-LABEL: local_cluster_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3385,11 +4375,17 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3399,11 +4395,17 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3413,12 +4415,18 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3427,11 +4435,17 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3440,11 +4454,17 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3453,11 +4473,17 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3466,11 +4492,17 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3479,12 +4511,18 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3493,12 +4531,18 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_cluster_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3507,12 +4551,18 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_cluster_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -3523,12 +4573,18 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_cluster_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -3540,12 +4596,18 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg( ; GFX1250-LABEL: local_cluster_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3562,12 +4624,16 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3577,12 +4643,18 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3592,11 +4664,17 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3608,11 +4686,17 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3623,12 +4707,18 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3638,11 +4728,17 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3652,11 +4748,17 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3666,11 +4768,17 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3680,11 +4788,17 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3694,12 +4808,18 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3710,12 +4830,18 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3725,12 +4851,18 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -3743,12 +4875,18 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -3761,12 +4899,18 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3784,12 +4928,16 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3799,12 +4947,18 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3814,11 +4968,17 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3830,11 +4990,17 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3845,12 +5011,18 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3860,11 +5032,17 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3874,11 +5052,17 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3888,11 +5072,17 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3902,11 +5092,17 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3916,12 +5112,18 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3932,12 +5134,18 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3947,12 +5155,18 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -3965,12 +5179,18 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -3983,12 +5203,18 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -4006,12 +5232,16 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_cmpxchg( ; GFX6-LABEL: local_cluster_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4020,12 +5250,18 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: local_cluster_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4034,11 +5270,17 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4048,11 +5290,17 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4061,12 +5309,18 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4075,11 +5329,17 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4088,11 +5348,17 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4101,11 +5367,17 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4114,11 +5386,17 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4127,12 +5405,18 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4141,12 +5425,18 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_cluster_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4154,12 +5444,18 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_cluster_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -4168,12 +5464,18 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_cluster_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -4182,12 +5484,18 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: local_cluster_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -4203,12 +5511,16 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_cluster_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4217,12 +5529,18 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: local_cluster_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4231,11 +5549,17 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4245,11 +5569,17 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4258,12 +5588,18 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4272,11 +5608,17 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4285,11 +5627,17 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4298,11 +5646,17 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4311,11 +5665,17 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4324,12 +5684,18 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4338,12 +5704,18 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_cluster_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4351,12 +5723,18 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_cluster_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -4365,12 +5743,18 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_cluster_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -4379,12 +5763,18 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_cmpxchg( ; GFX1250-LABEL: local_cluster_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -4400,12 +5790,16 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg( ; GFX6-LABEL: local_cluster_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4415,12 +5809,18 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg( ; ; GFX7-LABEL: local_cluster_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -4430,11 +5830,17 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4446,11 +5852,17 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4461,12 +5873,18 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4476,11 +5894,17 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4490,11 +5914,17 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4504,11 +5934,17 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4518,11 +5954,17 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4532,12 +5974,18 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4548,12 +5996,18 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_cluster_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4563,12 +6017,18 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_cluster_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -4581,12 +6041,18 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_cluster_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -4599,12 +6065,18 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg( ; GFX1250-LABEL: local_cluster_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -4622,12 +6094,16 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: local_cluster_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4637,12 +6113,18 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: local_cluster_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -4652,11 +6134,17 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4668,11 +6156,17 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4683,12 +6177,18 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4698,11 +6198,17 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4712,11 +6218,17 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4726,11 +6238,17 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4740,11 +6258,17 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4754,12 +6278,18 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4770,12 +6300,18 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_cluster_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4785,12 +6321,18 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_cluster_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -4803,12 +6345,18 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_cluster_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -4821,12 +6369,18 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: local_cluster_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -4844,12 +6398,16 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: local_cluster_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4859,12 +6417,18 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: local_cluster_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -4874,11 +6438,17 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4890,11 +6460,17 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4905,12 +6481,18 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4920,11 +6502,17 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4934,11 +6522,17 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4948,11 +6542,17 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4962,11 +6562,17 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4976,12 +6582,18 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4992,12 +6604,18 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_cluster_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5007,12 +6625,18 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_cluster_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -5025,12 +6649,18 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_cluster_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -5043,12 +6673,18 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: local_cluster_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5066,12 +6702,16 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5081,12 +6721,18 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5096,11 +6742,17 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5112,11 +6764,17 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5127,12 +6785,18 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5142,11 +6806,17 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5156,11 +6826,17 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5170,11 +6846,17 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5184,11 +6866,17 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5198,12 +6886,18 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5214,12 +6908,18 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5229,12 +6929,18 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -5247,12 +6953,18 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -5265,12 +6977,18 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5288,12 +7006,16 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: local_cluster_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5303,12 +7025,18 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_cluster_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5318,11 +7046,17 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5334,11 +7068,17 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5349,12 +7089,18 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5364,11 +7110,17 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5378,11 +7130,17 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5392,11 +7150,17 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5406,11 +7170,17 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5420,12 +7190,18 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5436,12 +7212,18 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_cluster_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5451,12 +7233,18 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_cluster_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -5469,12 +7257,18 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_cluster_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -5487,12 +7281,18 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: local_cluster_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5510,12 +7310,16 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg( ; GFX6-LABEL: local_cluster_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5525,12 +7329,18 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_cluster_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5540,11 +7350,17 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5556,11 +7372,17 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5571,12 +7393,18 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5586,11 +7414,17 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5600,11 +7434,17 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5614,11 +7454,17 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5628,11 +7474,17 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5642,12 +7494,18 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5658,12 +7516,18 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_cluster_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5673,12 +7537,18 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_cluster_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -5691,12 +7561,18 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_cluster_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -5709,12 +7585,18 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg( ; GFX1250-LABEL: local_cluster_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5732,12 +7614,16 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5747,12 +7633,18 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5762,11 +7654,17 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5778,11 +7676,17 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5793,12 +7697,18 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5808,11 +7718,17 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5822,11 +7738,17 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5836,11 +7758,17 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5850,11 +7778,17 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5864,12 +7798,18 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5880,12 +7820,18 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5895,12 +7841,18 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -5913,12 +7865,18 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -5931,12 +7889,18 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5954,12 +7918,16 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5969,12 +7937,18 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5984,11 +7958,17 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6000,11 +7980,17 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6015,12 +8001,18 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6030,11 +8022,17 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6044,11 +8042,17 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6058,11 +8062,17 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6072,11 +8082,17 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6086,12 +8102,18 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6102,12 +8124,18 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6117,12 +8145,18 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -6135,12 +8169,18 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -6153,12 +8193,18 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -6176,12 +8222,16 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6194,6 +8244,12 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -6211,6 +8267,12 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6226,6 +8288,12 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6241,6 +8309,12 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -6258,6 +8332,12 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6273,6 +8353,12 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6288,6 +8374,12 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6303,6 +8395,12 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6318,6 +8416,12 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6333,6 +8437,12 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6348,6 +8458,12 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6363,6 +8479,12 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6379,6 +8501,12 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6403,12 +8531,16 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6421,6 +8553,12 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -6438,6 +8576,12 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6454,6 +8598,12 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6469,6 +8619,12 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -6486,6 +8642,12 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6501,6 +8663,12 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6517,6 +8685,12 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6532,6 +8706,12 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6548,6 +8728,12 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6564,6 +8750,12 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6579,6 +8771,12 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6595,6 +8793,12 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6611,6 +8815,12 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6635,12 +8845,16 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_cluster_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6654,6 +8868,12 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_cluster_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -6672,6 +8892,12 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6689,6 +8915,12 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6706,6 +8938,12 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -6724,6 +8962,12 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6740,6 +8984,12 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6756,6 +9006,12 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6772,6 +9028,12 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6788,6 +9050,12 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6805,6 +9073,12 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6822,6 +9096,12 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6841,6 +9121,12 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6861,6 +9147,12 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6887,12 +9179,16 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6906,6 +9202,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -6924,6 +9226,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6942,6 +9250,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6959,6 +9273,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -6977,6 +9297,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6993,6 +9319,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7010,6 +9342,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7026,6 +9364,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7043,6 +9387,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7061,6 +9411,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7078,6 +9434,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7098,6 +9460,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7118,6 +9486,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7144,12 +9518,16 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7163,6 +9541,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7181,6 +9565,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7199,6 +9589,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7216,6 +9612,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7234,6 +9636,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7250,6 +9658,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7267,6 +9681,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7283,6 +9703,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7300,6 +9726,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7318,6 +9750,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7335,6 +9773,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7355,6 +9799,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7375,6 +9825,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7401,12 +9857,16 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -7419,6 +9879,12 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7436,6 +9902,12 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7452,6 +9924,12 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7467,6 +9945,12 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7484,6 +9968,12 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7499,6 +9989,12 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7515,6 +10011,12 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7530,6 +10032,12 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7546,6 +10054,12 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7562,6 +10076,12 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7577,6 +10097,12 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7593,6 +10119,12 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7609,6 +10141,12 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7633,12 +10171,16 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -7651,6 +10193,12 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7668,6 +10216,12 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7684,6 +10238,12 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7699,6 +10259,12 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7716,6 +10282,12 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7731,6 +10303,12 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7747,6 +10325,12 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7762,6 +10346,12 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7778,6 +10368,12 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7794,6 +10390,12 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7809,6 +10411,12 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7825,6 +10433,12 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7841,6 +10455,12 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7865,12 +10485,16 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg( ; GFX6-LABEL: local_cluster_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7884,6 +10508,12 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg( ; GFX7-LABEL: local_cluster_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7902,6 +10532,12 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7920,6 +10556,12 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7937,6 +10579,12 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7955,6 +10603,12 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7971,6 +10625,12 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7988,6 +10648,12 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8004,6 +10670,12 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8021,6 +10693,12 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8039,6 +10717,12 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8056,6 +10740,12 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8076,6 +10766,12 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8096,6 +10792,12 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -8122,12 +10824,16 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8141,6 +10847,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -8159,6 +10871,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8177,6 +10895,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8194,6 +10918,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -8212,6 +10942,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8228,6 +10964,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8245,6 +10987,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8261,6 +11009,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8278,6 +11032,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8296,6 +11056,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8313,6 +11079,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8333,6 +11105,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8353,6 +11131,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -8379,12 +11163,16 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8398,6 +11186,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -8416,6 +11210,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8434,6 +11234,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8451,6 +11257,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -8469,6 +11281,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8485,6 +11303,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8502,6 +11326,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8518,6 +11348,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8535,6 +11371,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8553,6 +11395,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8570,6 +11418,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8590,6 +11444,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8610,6 +11470,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -8636,12 +11502,16 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8655,6 +11525,12 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -8673,6 +11549,12 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8691,6 +11573,12 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8708,6 +11596,12 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -8726,6 +11620,12 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8742,6 +11642,12 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8759,6 +11665,12 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8775,6 +11687,12 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8792,6 +11710,12 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8810,6 +11734,12 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8827,6 +11757,12 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8847,6 +11783,12 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8867,6 +11809,12 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -8893,12 +11841,16 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8912,6 +11864,12 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -8930,6 +11888,12 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8948,6 +11912,12 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8965,6 +11935,12 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -8983,6 +11959,12 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8999,6 +11981,12 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9016,6 +12004,12 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9032,6 +12026,12 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9049,6 +12049,12 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9067,6 +12073,12 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9084,6 +12096,12 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9104,6 +12122,12 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9124,6 +12148,12 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9150,12 +12180,16 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9169,6 +12203,12 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -9187,6 +12227,12 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9205,6 +12251,12 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9222,6 +12274,12 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -9240,6 +12298,12 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9256,6 +12320,12 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9273,6 +12343,12 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9289,6 +12365,12 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9306,6 +12388,12 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9324,6 +12412,12 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9341,6 +12435,12 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9361,6 +12461,12 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9381,6 +12487,12 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9407,12 +12519,16 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9426,6 +12542,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -9444,6 +12566,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9462,6 +12590,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9479,6 +12613,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -9497,6 +12637,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9513,6 +12659,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9530,6 +12682,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9546,6 +12704,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9563,6 +12727,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9581,6 +12751,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9598,6 +12774,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9618,6 +12800,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9638,6 +12826,12 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9664,12 +12858,16 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9683,6 +12881,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -9701,6 +12905,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9719,6 +12929,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9736,6 +12952,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -9754,6 +12976,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9770,6 +12998,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9787,6 +13021,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9803,6 +13043,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9820,6 +13066,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9838,6 +13090,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9855,6 +13113,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9875,6 +13139,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9895,6 +13165,12 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9936,135 +13212,169 @@ define amdgpu_kernel void @local_cluster_one_as_unordered_load( ; ; GFX7-LABEL: local_cluster_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_cluster_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_cluster_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_cluster_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -10072,11 +13382,15 @@ define amdgpu_kernel void @local_cluster_one_as_unordered_load( ; ; GFX12-CU-LABEL: local_cluster_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -10085,11 +13399,15 @@ define amdgpu_kernel void @local_cluster_one_as_unordered_load( ; GFX1250-LABEL: local_cluster_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -10120,135 +13438,169 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_load( ; ; GFX7-LABEL: local_cluster_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_cluster_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -10256,11 +13608,15 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_load( ; ; GFX12-CU-LABEL: local_cluster_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -10269,11 +13625,15 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_load( ; GFX1250-LABEL: local_cluster_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -10304,135 +13664,169 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_load( ; ; GFX7-LABEL: local_cluster_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_cluster_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_cluster_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_cluster_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -10440,11 +13834,15 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_load( ; ; GFX12-CU-LABEL: local_cluster_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -10453,11 +13851,15 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_load( ; GFX1250-LABEL: local_cluster_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -10488,135 +13890,169 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_load( ; ; GFX7-LABEL: local_cluster_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -10624,11 +14060,15 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -10637,11 +14077,15 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_load( ; GFX1250-LABEL: local_cluster_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -10656,12 +14100,14 @@ entry: define amdgpu_kernel void @local_cluster_one_as_unordered_store( ; GFX6-LABEL: local_cluster_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -10669,6 +14115,10 @@ define amdgpu_kernel void @local_cluster_one_as_unordered_store( ; GFX7-LABEL: local_cluster_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10680,6 +14130,10 @@ define amdgpu_kernel void @local_cluster_one_as_unordered_store( ; GFX10-WGP-LABEL: local_cluster_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -10690,6 +14144,10 @@ define amdgpu_kernel void @local_cluster_one_as_unordered_store( ; GFX10-CU-LABEL: local_cluster_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -10700,6 +14158,10 @@ define amdgpu_kernel void @local_cluster_one_as_unordered_store( ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10711,6 +14173,10 @@ define amdgpu_kernel void @local_cluster_one_as_unordered_store( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -10721,6 +14187,10 @@ define amdgpu_kernel void @local_cluster_one_as_unordered_store( ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -10731,6 +14201,10 @@ define amdgpu_kernel void @local_cluster_one_as_unordered_store( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -10741,6 +14215,10 @@ define amdgpu_kernel void @local_cluster_one_as_unordered_store( ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -10751,6 +14229,10 @@ define amdgpu_kernel void @local_cluster_one_as_unordered_store( ; GFX11-WGP-LABEL: local_cluster_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -10761,6 +14243,10 @@ define amdgpu_kernel void @local_cluster_one_as_unordered_store( ; GFX11-CU-LABEL: local_cluster_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -10771,6 +14257,10 @@ define amdgpu_kernel void @local_cluster_one_as_unordered_store( ; GFX12-WGP-LABEL: local_cluster_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -10781,6 +14271,10 @@ define amdgpu_kernel void @local_cluster_one_as_unordered_store( ; GFX12-CU-LABEL: local_cluster_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -10792,6 +14286,10 @@ define amdgpu_kernel void @local_cluster_one_as_unordered_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -10807,12 +14305,14 @@ entry: define amdgpu_kernel void @local_cluster_one_as_monotonic_store( ; GFX6-LABEL: local_cluster_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -10820,6 +14320,10 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_store( ; GFX7-LABEL: local_cluster_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10831,6 +14335,10 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_store( ; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -10841,6 +14349,10 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_store( ; GFX10-CU-LABEL: local_cluster_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -10851,6 +14363,10 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_store( ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10862,6 +14378,10 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_store( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -10872,6 +14392,10 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_store( ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -10882,6 +14406,10 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_store( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -10892,6 +14420,10 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_store( ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -10902,6 +14434,10 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_store( ; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -10912,6 +14448,10 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_store( ; GFX11-CU-LABEL: local_cluster_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -10922,6 +14462,10 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_store( ; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -10932,6 +14476,10 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_store( ; GFX12-CU-LABEL: local_cluster_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -10943,6 +14491,10 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -10958,12 +14510,14 @@ entry: define amdgpu_kernel void @local_cluster_one_as_release_store( ; GFX6-LABEL: local_cluster_one_as_release_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -10971,6 +14525,10 @@ define amdgpu_kernel void @local_cluster_one_as_release_store( ; GFX7-LABEL: local_cluster_one_as_release_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10982,6 +14540,10 @@ define amdgpu_kernel void @local_cluster_one_as_release_store( ; GFX10-WGP-LABEL: local_cluster_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -10992,6 +14554,10 @@ define amdgpu_kernel void @local_cluster_one_as_release_store( ; GFX10-CU-LABEL: local_cluster_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -11002,6 +14568,10 @@ define amdgpu_kernel void @local_cluster_one_as_release_store( ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11013,6 +14583,10 @@ define amdgpu_kernel void @local_cluster_one_as_release_store( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -11023,6 +14597,10 @@ define amdgpu_kernel void @local_cluster_one_as_release_store( ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -11033,6 +14611,10 @@ define amdgpu_kernel void @local_cluster_one_as_release_store( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -11043,6 +14625,10 @@ define amdgpu_kernel void @local_cluster_one_as_release_store( ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -11053,6 +14639,10 @@ define amdgpu_kernel void @local_cluster_one_as_release_store( ; GFX11-WGP-LABEL: local_cluster_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -11063,6 +14653,10 @@ define amdgpu_kernel void @local_cluster_one_as_release_store( ; GFX11-CU-LABEL: local_cluster_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -11073,6 +14667,10 @@ define amdgpu_kernel void @local_cluster_one_as_release_store( ; GFX12-WGP-LABEL: local_cluster_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -11083,6 +14681,10 @@ define amdgpu_kernel void @local_cluster_one_as_release_store( ; GFX12-CU-LABEL: local_cluster_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -11094,6 +14696,10 @@ define amdgpu_kernel void @local_cluster_one_as_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -11109,12 +14715,14 @@ entry: define amdgpu_kernel void @local_cluster_one_as_seq_cst_store( ; GFX6-LABEL: local_cluster_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -11122,6 +14730,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_store( ; GFX7-LABEL: local_cluster_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11133,6 +14745,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_store( ; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -11143,6 +14759,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_store( ; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -11153,6 +14773,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_store( ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11164,6 +14788,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -11174,6 +14802,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_store( ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -11184,6 +14816,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -11194,6 +14830,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_store( ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -11204,6 +14844,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_store( ; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -11214,6 +14858,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_store( ; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -11224,6 +14872,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_store( ; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -11234,6 +14886,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_store( ; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -11245,6 +14901,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -11261,133 +14921,183 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_atomicrmw( ; GFX6-LABEL: local_cluster_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_cluster_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_cluster_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_cluster_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -11395,10 +15105,14 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_atomicrmw( ; GFX1250-LABEL: local_cluster_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -11412,133 +15126,183 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_atomicrmw( ; GFX6-LABEL: local_cluster_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_cluster_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_cluster_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_cluster_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_cluster_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_cluster_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -11546,10 +15310,14 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_atomicrmw( ; GFX1250-LABEL: local_cluster_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -11563,133 +15331,183 @@ define amdgpu_kernel void @local_cluster_one_as_release_atomicrmw( ; GFX6-LABEL: local_cluster_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_cluster_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_cluster_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_cluster_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_cluster_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_cluster_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -11697,10 +15515,14 @@ define amdgpu_kernel void @local_cluster_one_as_release_atomicrmw( ; GFX1250-LABEL: local_cluster_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -11714,133 +15536,183 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: local_cluster_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_cluster_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_cluster_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_cluster_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_cluster_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_cluster_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -11848,10 +15720,14 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_atomicrmw( ; GFX1250-LABEL: local_cluster_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -11865,133 +15741,183 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: local_cluster_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_cluster_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -11999,10 +15925,14 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_atomicrmw( ; GFX1250-LABEL: local_cluster_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -12016,11 +15946,13 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -12032,6 +15964,10 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12047,6 +15983,10 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -12060,6 +16000,10 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_ret_atomicrmw( ; GFX10-CU-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -12073,6 +16017,10 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12088,6 +16036,10 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -12101,6 +16053,10 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -12114,6 +16070,10 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -12127,6 +16087,10 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -12140,6 +16104,10 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12153,6 +16121,10 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12166,6 +16138,10 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12179,6 +16155,10 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12193,6 +16173,10 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -12213,11 +16197,13 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -12229,6 +16215,10 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12244,6 +16234,10 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -12257,6 +16251,10 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -12270,6 +16268,10 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12285,6 +16287,10 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -12298,6 +16304,10 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -12311,6 +16321,10 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -12324,6 +16338,10 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -12337,6 +16355,10 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12350,6 +16372,10 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12363,6 +16389,10 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12376,6 +16406,10 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12390,6 +16424,10 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -12410,11 +16448,13 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -12426,6 +16466,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12441,6 +16485,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -12454,6 +16502,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -12467,6 +16519,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12482,6 +16538,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -12495,6 +16555,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -12508,6 +16572,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -12521,6 +16589,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -12534,6 +16606,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12547,6 +16623,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12560,6 +16640,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12573,6 +16657,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12587,6 +16675,10 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -12607,12 +16699,16 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12620,12 +16716,18 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12633,11 +16735,17 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12645,11 +16753,17 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12657,12 +16771,18 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12670,11 +16790,17 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12682,11 +16808,17 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12694,11 +16826,17 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12706,11 +16844,17 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12718,48 +16862,72 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -12767,12 +16935,18 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -12787,12 +16961,16 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12800,12 +16978,18 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12813,11 +16997,17 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12825,11 +17015,17 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12837,12 +17033,18 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12850,11 +17052,17 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12862,11 +17070,17 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12874,11 +17088,17 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12886,11 +17106,17 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12898,48 +17124,72 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -12947,12 +17197,18 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -12967,12 +17223,16 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12980,12 +17240,18 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12993,11 +17259,17 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13005,11 +17277,17 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13017,12 +17295,18 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13030,11 +17314,17 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13042,11 +17332,17 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13054,11 +17350,17 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13066,11 +17368,17 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13078,48 +17386,72 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13127,12 +17459,18 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_cmpxchg( ; GFX1250-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13147,12 +17485,16 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13160,12 +17502,18 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13173,11 +17521,17 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13185,11 +17539,17 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13197,12 +17557,18 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13210,11 +17576,17 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13222,11 +17594,17 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13234,11 +17612,17 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13246,11 +17630,17 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13258,48 +17648,72 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13307,12 +17721,18 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13327,12 +17747,16 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13340,12 +17764,18 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13353,11 +17783,17 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13365,11 +17801,17 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13377,12 +17819,18 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13390,11 +17838,17 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13402,11 +17856,17 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13414,11 +17874,17 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13426,11 +17892,17 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13438,48 +17910,72 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13487,12 +17983,18 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13507,12 +18009,16 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13520,12 +18026,18 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13533,11 +18045,17 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13545,11 +18063,17 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13557,12 +18081,18 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13570,11 +18100,17 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13582,11 +18118,17 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13594,11 +18136,17 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13606,11 +18154,17 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13618,48 +18172,72 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13667,12 +18245,18 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13687,12 +18271,16 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13700,12 +18288,18 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13713,11 +18307,17 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13725,11 +18325,17 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13737,12 +18343,18 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13750,11 +18362,17 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13762,11 +18380,17 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13774,11 +18398,17 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13786,11 +18416,17 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13798,48 +18434,72 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13847,12 +18507,18 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_cmpxchg( ; GFX1250-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13867,12 +18533,16 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13880,12 +18550,18 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: local_cluster_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13893,11 +18569,17 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13905,11 +18587,17 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13917,12 +18605,18 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13930,11 +18624,17 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13942,11 +18642,17 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13954,11 +18660,17 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13966,11 +18678,17 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13978,48 +18696,72 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_cluster_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14027,12 +18769,18 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_cmpxchg( ; GFX1250-LABEL: local_cluster_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14047,12 +18795,16 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14060,12 +18812,18 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14073,11 +18831,17 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14085,11 +18849,17 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14097,12 +18867,18 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14110,11 +18886,17 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14122,11 +18904,17 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14134,11 +18922,17 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14146,11 +18940,17 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14158,48 +18958,72 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14207,12 +19031,18 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14227,12 +19057,16 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14240,12 +19074,18 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14253,11 +19093,17 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14265,11 +19111,17 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14277,12 +19129,18 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14290,11 +19148,17 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14302,11 +19166,17 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14314,11 +19184,17 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14326,11 +19202,17 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14338,48 +19220,72 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14387,12 +19293,18 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14407,12 +19319,16 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14420,12 +19336,18 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14433,11 +19355,17 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14445,11 +19373,17 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14457,12 +19391,18 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14470,11 +19410,17 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14482,11 +19428,17 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14494,11 +19446,17 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14506,11 +19464,17 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14518,48 +19482,72 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14567,12 +19555,18 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14587,12 +19581,16 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14600,12 +19598,18 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14613,11 +19617,17 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14625,11 +19635,17 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14637,12 +19653,18 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14650,11 +19672,17 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14662,11 +19690,17 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14674,11 +19708,17 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14686,11 +19726,17 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14698,48 +19744,72 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14747,12 +19817,18 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14767,12 +19843,16 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14780,12 +19860,18 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14793,11 +19879,17 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14805,11 +19897,17 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14817,12 +19915,18 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14830,11 +19934,17 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14842,11 +19952,17 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14854,11 +19970,17 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14866,11 +19988,17 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14878,48 +20006,72 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14927,12 +20079,18 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_cmpxchg( ; GFX1250-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14947,12 +20105,16 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14960,12 +20122,18 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14973,11 +20141,17 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14985,11 +20159,17 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14997,12 +20177,18 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15010,11 +20196,17 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15022,11 +20214,17 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15034,11 +20232,17 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15046,11 +20250,17 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15058,48 +20268,72 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -15107,12 +20341,18 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -15127,12 +20367,16 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15140,12 +20384,18 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15153,11 +20403,17 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15165,11 +20421,17 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15177,12 +20439,18 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15190,11 +20458,17 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15202,11 +20476,17 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15214,11 +20494,17 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15226,11 +20512,17 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15238,48 +20530,72 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -15287,12 +20603,18 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -15307,12 +20629,16 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -15325,6 +20651,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -15342,6 +20674,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15357,6 +20695,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15372,6 +20716,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -15389,6 +20739,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15404,6 +20760,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15419,6 +20781,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15434,6 +20802,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15449,6 +20823,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15464,6 +20844,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15479,6 +20865,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15494,6 +20886,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15510,6 +20908,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15534,12 +20938,16 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -15552,6 +20960,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -15569,6 +20983,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15584,6 +21004,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15599,6 +21025,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -15616,6 +21048,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15631,6 +21069,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15646,6 +21090,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15661,6 +21111,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15676,6 +21132,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15691,6 +21153,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15706,6 +21174,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15721,6 +21195,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15737,6 +21217,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15761,12 +21247,16 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -15779,6 +21269,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -15796,6 +21292,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15811,6 +21313,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15826,6 +21334,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -15843,6 +21357,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15858,6 +21378,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15873,6 +21399,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15888,6 +21420,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15903,6 +21441,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15918,6 +21462,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15933,6 +21483,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15948,6 +21504,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15964,6 +21526,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15988,12 +21556,16 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16006,6 +21578,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16023,6 +21601,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16038,6 +21622,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16053,6 +21643,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16070,6 +21666,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16085,6 +21687,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16100,6 +21708,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16115,6 +21729,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16130,6 +21750,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16145,6 +21771,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16160,6 +21792,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16175,6 +21813,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16191,6 +21835,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16215,12 +21865,16 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16233,6 +21887,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16250,6 +21910,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16265,6 +21931,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16280,6 +21952,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16297,6 +21975,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16312,6 +21996,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16327,6 +22017,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16342,6 +22038,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16357,6 +22059,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16372,6 +22080,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16387,6 +22101,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16402,6 +22122,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16418,6 +22144,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16442,12 +22174,16 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16460,6 +22196,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16477,6 +22219,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16492,6 +22240,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16507,6 +22261,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16524,6 +22284,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16539,6 +22305,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16554,6 +22326,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16569,6 +22347,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16584,6 +22368,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16599,6 +22389,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16614,6 +22410,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16629,6 +22431,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16645,6 +22453,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16669,12 +22483,16 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16687,6 +22505,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16704,6 +22528,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16719,6 +22549,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16734,6 +22570,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16751,6 +22593,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16766,6 +22614,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16781,6 +22635,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16796,6 +22656,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16811,6 +22677,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16826,6 +22698,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16841,6 +22719,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16856,6 +22740,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16872,6 +22762,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16896,12 +22792,16 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16914,6 +22814,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16931,6 +22837,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16946,6 +22858,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16961,6 +22879,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16978,6 +22902,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16993,6 +22923,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17008,6 +22944,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17023,6 +22965,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17038,6 +22986,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17053,6 +23007,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17068,6 +23028,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17083,6 +23049,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17099,6 +23071,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -17123,12 +23101,16 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -17141,6 +23123,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -17158,6 +23146,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17173,6 +23167,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17188,6 +23188,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -17205,6 +23211,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17220,6 +23232,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17235,6 +23253,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17250,6 +23274,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17265,6 +23295,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17280,6 +23316,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17295,6 +23337,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17310,6 +23358,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17326,6 +23380,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -17350,12 +23410,16 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -17368,6 +23432,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -17385,6 +23455,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17400,6 +23476,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17415,6 +23497,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -17432,6 +23520,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17447,6 +23541,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17462,6 +23562,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17477,6 +23583,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17492,6 +23604,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17507,6 +23625,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17522,6 +23646,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17537,6 +23667,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17553,6 +23689,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -17577,12 +23719,16 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -17595,6 +23741,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -17612,6 +23764,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17627,6 +23785,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17642,6 +23806,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -17659,6 +23829,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17674,6 +23850,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17689,6 +23871,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17704,6 +23892,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17719,6 +23913,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17734,6 +23934,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17749,6 +23955,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17764,6 +23976,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17780,6 +23998,12 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -17804,12 +24028,16 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -17822,6 +24050,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -17839,6 +24073,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17854,6 +24094,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17869,6 +24115,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -17886,6 +24138,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17901,6 +24159,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17916,6 +24180,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17931,6 +24201,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17946,6 +24222,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17961,6 +24243,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17976,6 +24264,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17991,6 +24285,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18007,6 +24307,12 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -18031,12 +24337,16 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -18049,6 +24359,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -18066,6 +24382,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18081,6 +24403,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18096,6 +24424,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -18113,6 +24447,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18128,6 +24468,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18143,6 +24489,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18158,6 +24510,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18173,6 +24531,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18188,6 +24552,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18203,6 +24573,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18218,6 +24594,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18234,6 +24616,12 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -18258,12 +24646,16 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -18276,6 +24668,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -18293,6 +24691,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18308,6 +24712,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18323,6 +24733,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -18340,6 +24756,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18355,6 +24777,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18370,6 +24798,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18385,6 +24819,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18400,6 +24840,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18415,6 +24861,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18430,6 +24882,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18445,6 +24903,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18461,6 +24925,12 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -18485,12 +24955,16 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -18503,6 +24977,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -18520,6 +25000,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18535,6 +25021,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18550,6 +25042,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -18567,6 +25065,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18582,6 +25086,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18597,6 +25107,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18612,6 +25128,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18627,6 +25149,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18642,6 +25170,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18657,6 +25191,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18672,6 +25212,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18688,6 +25234,12 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index 9280b1d2d301..50054d9862bf 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -19,6 +19,7 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr8 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -42,24 +43,30 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_nontemporal_load_0: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: ds_read_b32 v1, v1 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -68,10 +75,13 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; GFX10-CU-LABEL: local_nontemporal_load_0: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: ds_read_b32 v1, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -81,6 +91,9 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; SKIP-CACHE-INV-LABEL: local_nontemporal_load_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -101,10 +114,13 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -113,10 +129,13 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -125,10 +144,13 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; GFX942-NOTTGSPLIT-LABEL: local_nontemporal_load_0: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -137,10 +159,13 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; GFX942-TGSPLIT-LABEL: local_nontemporal_load_0: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -149,10 +174,13 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; GFX11-WGP-LABEL: local_nontemporal_load_0: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: ds_load_b32 v1, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -161,10 +189,13 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; GFX11-CU-LABEL: local_nontemporal_load_0: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: ds_load_b32 v1, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -173,38 +204,50 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; GFX12-WGP-LABEL: local_nontemporal_load_0: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: ds_load_b32 v1, v1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_nontemporal_load_0: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: ds_load_b32 v1, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: local_nontemporal_load_0: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ds_load_b32 v1, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { @@ -219,6 +262,7 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr8 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -244,27 +288,33 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v0, s7, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], s6, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_nontemporal_load_1: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-WGP-NEXT: ds_read_b32 v1, v1 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -274,10 +324,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX10-CU-LABEL: local_nontemporal_load_1: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-CU-NEXT: ds_read_b32 v1, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -287,6 +340,9 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; SKIP-CACHE-INV-LABEL: local_nontemporal_load_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -310,12 +366,15 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s7, 0x3ff ; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -325,12 +384,15 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s7, 0x3ff ; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -340,12 +402,15 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX942-NOTTGSPLIT-LABEL: local_nontemporal_load_1: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 0x3ff ; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -355,12 +420,15 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX942-TGSPLIT-LABEL: local_nontemporal_load_1: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 0x3ff ; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -370,12 +438,15 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX11-WGP-LABEL: local_nontemporal_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff ; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-WGP-NEXT: ds_load_b32 v1, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -385,12 +456,15 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX11-CU-LABEL: local_nontemporal_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff ; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-CU-NEXT: ds_load_b32 v1, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -400,30 +474,38 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX12-WGP-LABEL: local_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX12-WGP-NEXT: ds_load_b32 v1, v1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX12-CU-NEXT: ds_load_b32 v1, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; @@ -431,15 +513,19 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_mov_b32 s3, 0x3ff ; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX1250-NEXT: ds_load_b32 v1, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { @@ -454,6 +540,8 @@ entry: define amdgpu_kernel void @local_nontemporal_store_0( ; GFX6-LABEL: local_nontemporal_store_0: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 ; GFX6-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 @@ -468,6 +556,9 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; GFX7-LABEL: local_nontemporal_store_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -481,6 +572,9 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; GFX10-WGP-LABEL: local_nontemporal_store_0: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -493,6 +587,9 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; GFX10-CU-LABEL: local_nontemporal_store_0: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -505,6 +602,9 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; SKIP-CACHE-INV-LABEL: local_nontemporal_store_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -518,6 +618,9 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -530,6 +633,9 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -542,6 +648,9 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; GFX942-NOTTGSPLIT-LABEL: local_nontemporal_store_0: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -554,6 +663,9 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; GFX942-TGSPLIT-LABEL: local_nontemporal_store_0: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -566,6 +678,9 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; GFX11-WGP-LABEL: local_nontemporal_store_0: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -578,6 +693,9 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; GFX11-CU-LABEL: local_nontemporal_store_0: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -590,6 +708,9 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; GFX12-WGP-LABEL: local_nontemporal_store_0: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -602,6 +723,9 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; GFX12-CU-LABEL: local_nontemporal_store_0: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -615,6 +739,9 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; GFX1250-LABEL: local_nontemporal_store_0: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -634,6 +761,8 @@ entry: define amdgpu_kernel void @local_nontemporal_store_1( ; GFX6-LABEL: local_nontemporal_store_1: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 ; GFX6-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 @@ -650,6 +779,9 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX7-LABEL: local_nontemporal_store_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -665,6 +797,9 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX10-WGP-LABEL: local_nontemporal_store_1: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -677,6 +812,9 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX10-CU-LABEL: local_nontemporal_store_1: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -689,6 +827,9 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; SKIP-CACHE-INV-LABEL: local_nontemporal_store_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -704,6 +845,9 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -718,6 +862,9 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -732,6 +879,9 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX942-NOTTGSPLIT-LABEL: local_nontemporal_store_1: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -746,6 +896,9 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX942-TGSPLIT-LABEL: local_nontemporal_store_1: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -760,6 +913,9 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX11-WGP-LABEL: local_nontemporal_store_1: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -774,6 +930,9 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX11-CU-LABEL: local_nontemporal_store_1: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -788,6 +947,9 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX12-WGP-LABEL: local_nontemporal_store_1: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -802,6 +964,9 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX12-CU-LABEL: local_nontemporal_store_1: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -817,6 +982,9 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; GFX1250-LABEL: local_nontemporal_store_1: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -843,6 +1011,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr8 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -866,24 +1035,30 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_nontemporal_volatile_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: ds_read_b32 v1, v1 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -892,10 +1067,13 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; GFX10-CU-LABEL: local_nontemporal_volatile_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: ds_read_b32 v1, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -905,6 +1083,9 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; SKIP-CACHE-INV-LABEL: local_nontemporal_volatile_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -925,10 +1106,13 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_volatile_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -937,10 +1121,13 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; GFX90A-TGSPLIT-LABEL: local_nontemporal_volatile_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -949,10 +1136,13 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; GFX942-NOTTGSPLIT-LABEL: local_nontemporal_volatile_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -961,10 +1151,13 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; GFX942-TGSPLIT-LABEL: local_nontemporal_volatile_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -973,10 +1166,13 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; GFX11-WGP-LABEL: local_nontemporal_volatile_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: ds_load_b32 v1, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -985,10 +1181,13 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; GFX11-CU-LABEL: local_nontemporal_volatile_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: ds_load_b32 v1, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -997,38 +1196,50 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; GFX12-WGP-LABEL: local_nontemporal_volatile_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: ds_load_b32 v1, v1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_nontemporal_volatile_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: ds_load_b32 v1, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: local_nontemporal_volatile_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ds_load_b32 v1, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll index e636a65bf47e..e5543529fc37 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll @@ -33,135 +33,169 @@ define amdgpu_kernel void @local_singlethread_unordered_load( ; ; GFX7-LABEL: local_singlethread_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -169,11 +203,15 @@ define amdgpu_kernel void @local_singlethread_unordered_load( ; ; GFX12-CU-LABEL: local_singlethread_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -182,11 +220,15 @@ define amdgpu_kernel void @local_singlethread_unordered_load( ; GFX1250-LABEL: local_singlethread_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -217,135 +259,169 @@ define amdgpu_kernel void @local_singlethread_monotonic_load( ; ; GFX7-LABEL: local_singlethread_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -353,11 +429,15 @@ define amdgpu_kernel void @local_singlethread_monotonic_load( ; ; GFX12-CU-LABEL: local_singlethread_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -366,11 +446,15 @@ define amdgpu_kernel void @local_singlethread_monotonic_load( ; GFX1250-LABEL: local_singlethread_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -401,135 +485,169 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; ; GFX7-LABEL: local_singlethread_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -537,11 +655,15 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; ; GFX12-CU-LABEL: local_singlethread_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -550,11 +672,15 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX1250-LABEL: local_singlethread_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -585,135 +711,169 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; ; GFX7-LABEL: local_singlethread_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -721,11 +881,15 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; ; GFX12-CU-LABEL: local_singlethread_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -734,11 +898,15 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX1250-LABEL: local_singlethread_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -753,12 +921,14 @@ entry: define amdgpu_kernel void @local_singlethread_unordered_store( ; GFX6-LABEL: local_singlethread_unordered_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -766,6 +936,10 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; GFX7-LABEL: local_singlethread_unordered_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -777,6 +951,10 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; GFX10-WGP-LABEL: local_singlethread_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -787,6 +965,10 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; GFX10-CU-LABEL: local_singlethread_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -797,6 +979,10 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; SKIP-CACHE-INV-LABEL: local_singlethread_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -808,6 +994,10 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -818,6 +1008,10 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; GFX90A-TGSPLIT-LABEL: local_singlethread_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -828,6 +1022,10 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -838,6 +1036,10 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; GFX942-TGSPLIT-LABEL: local_singlethread_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -848,6 +1050,10 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; GFX11-WGP-LABEL: local_singlethread_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -858,6 +1064,10 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; GFX11-CU-LABEL: local_singlethread_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -868,6 +1078,10 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; GFX12-WGP-LABEL: local_singlethread_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -878,6 +1092,10 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; GFX12-CU-LABEL: local_singlethread_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -889,6 +1107,10 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -904,12 +1126,14 @@ entry: define amdgpu_kernel void @local_singlethread_monotonic_store( ; GFX6-LABEL: local_singlethread_monotonic_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -917,6 +1141,10 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; GFX7-LABEL: local_singlethread_monotonic_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -928,6 +1156,10 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; GFX10-WGP-LABEL: local_singlethread_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -938,6 +1170,10 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; GFX10-CU-LABEL: local_singlethread_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -948,6 +1184,10 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -959,6 +1199,10 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -969,6 +1213,10 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -979,6 +1227,10 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -989,6 +1241,10 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; GFX942-TGSPLIT-LABEL: local_singlethread_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -999,6 +1255,10 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; GFX11-WGP-LABEL: local_singlethread_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1009,6 +1269,10 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; GFX11-CU-LABEL: local_singlethread_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1019,6 +1283,10 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; GFX12-WGP-LABEL: local_singlethread_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1029,6 +1297,10 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; GFX12-CU-LABEL: local_singlethread_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1040,6 +1312,10 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -1055,12 +1331,14 @@ entry: define amdgpu_kernel void @local_singlethread_release_store( ; GFX6-LABEL: local_singlethread_release_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -1068,6 +1346,10 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX7-LABEL: local_singlethread_release_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1079,6 +1361,10 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX10-WGP-LABEL: local_singlethread_release_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -1089,6 +1375,10 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX10-CU-LABEL: local_singlethread_release_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -1099,6 +1389,10 @@ define amdgpu_kernel void @local_singlethread_release_store( ; SKIP-CACHE-INV-LABEL: local_singlethread_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1110,6 +1404,10 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1120,6 +1418,10 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1130,6 +1432,10 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1140,6 +1446,10 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX942-TGSPLIT-LABEL: local_singlethread_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1150,6 +1460,10 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX11-WGP-LABEL: local_singlethread_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1160,6 +1474,10 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX11-CU-LABEL: local_singlethread_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1170,6 +1488,10 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX12-WGP-LABEL: local_singlethread_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1180,6 +1502,10 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX12-CU-LABEL: local_singlethread_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1191,6 +1517,10 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -1206,12 +1536,14 @@ entry: define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX6-LABEL: local_singlethread_seq_cst_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -1219,6 +1551,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX7-LABEL: local_singlethread_seq_cst_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1230,6 +1566,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX10-WGP-LABEL: local_singlethread_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -1240,6 +1580,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX10-CU-LABEL: local_singlethread_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -1250,6 +1594,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1261,6 +1609,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1271,6 +1623,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1281,6 +1637,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1291,6 +1651,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX942-TGSPLIT-LABEL: local_singlethread_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1301,6 +1665,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX11-WGP-LABEL: local_singlethread_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1311,6 +1679,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX11-CU-LABEL: local_singlethread_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1321,6 +1693,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX12-WGP-LABEL: local_singlethread_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1331,6 +1707,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX12-CU-LABEL: local_singlethread_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1342,6 +1722,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -1358,133 +1742,183 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw( ; GFX6-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -1492,10 +1926,14 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw( ; GFX1250-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -1509,133 +1947,183 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX6-LABEL: local_singlethread_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -1643,10 +2131,14 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX1250-LABEL: local_singlethread_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -1660,133 +2152,183 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX6-LABEL: local_singlethread_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -1794,10 +2336,14 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX1250-LABEL: local_singlethread_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -1811,133 +2357,183 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX6-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -1945,10 +2541,14 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX1250-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -1962,133 +2562,183 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX6-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -2096,10 +2746,14 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX1250-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -2113,11 +2767,13 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX6-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -2129,6 +2785,10 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX7-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2144,6 +2804,10 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX10-WGP-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2157,6 +2821,10 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX10-CU-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2170,6 +2838,10 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2185,6 +2857,10 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2198,6 +2874,10 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2211,6 +2891,10 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2224,6 +2908,10 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2237,6 +2925,10 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2250,6 +2942,10 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2263,6 +2959,10 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2276,6 +2976,10 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2290,6 +2994,10 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -2310,11 +3018,13 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -2326,6 +3036,10 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX7-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2341,6 +3055,10 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX10-WGP-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2354,6 +3072,10 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX10-CU-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2367,6 +3089,10 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2382,6 +3108,10 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2395,6 +3125,10 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2408,6 +3142,10 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2421,6 +3159,10 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2434,6 +3176,10 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2447,6 +3193,10 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2460,6 +3210,10 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2473,6 +3227,10 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2487,6 +3245,10 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -2507,11 +3269,13 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -2523,6 +3287,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX7-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2538,6 +3306,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX10-WGP-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2551,6 +3323,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX10-CU-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2564,6 +3340,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2579,6 +3359,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2592,6 +3376,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2605,6 +3393,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2618,6 +3410,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2631,6 +3427,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2644,6 +3444,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2657,6 +3461,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2670,6 +3478,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2684,6 +3496,10 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -2704,12 +3520,16 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2717,12 +3537,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2730,11 +3556,17 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2742,11 +3574,17 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2754,12 +3592,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2767,11 +3611,17 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2779,11 +3629,17 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2791,11 +3647,17 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2803,11 +3665,17 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2815,48 +3683,72 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -2864,12 +3756,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -2884,12 +3782,16 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX6-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2897,12 +3799,18 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2910,11 +3818,17 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2922,11 +3836,17 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2934,12 +3854,18 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2947,11 +3873,17 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2959,11 +3891,17 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2971,11 +3909,17 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2983,11 +3927,17 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2995,48 +3945,72 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -3044,12 +4018,18 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -3064,12 +4044,16 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX6-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3077,12 +4061,18 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; ; GFX7-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3090,11 +4080,17 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3102,11 +4098,17 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3114,12 +4116,18 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3127,11 +4135,17 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3139,11 +4153,17 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3151,11 +4171,17 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3163,11 +4189,17 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3175,48 +4207,72 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -3224,12 +4280,18 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX1250-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -3244,12 +4306,16 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3257,12 +4323,18 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3270,11 +4342,17 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3282,11 +4360,17 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3294,12 +4378,18 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3307,11 +4397,17 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3319,11 +4415,17 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3331,11 +4433,17 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3343,11 +4451,17 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3355,48 +4469,72 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -3404,12 +4542,18 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -3424,12 +4568,16 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3437,12 +4585,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3450,11 +4604,17 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3462,11 +4622,17 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3474,12 +4640,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3487,11 +4659,17 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3499,11 +4677,17 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3511,11 +4695,17 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3523,11 +4713,17 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3535,48 +4731,72 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -3584,12 +4804,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -3604,12 +4830,16 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX6-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3617,12 +4847,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3630,11 +4866,17 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3642,11 +4884,17 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3654,12 +4902,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3667,11 +4921,17 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3679,11 +4939,17 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3691,11 +4957,17 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3703,11 +4975,17 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3715,48 +4993,72 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -3764,12 +5066,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -3784,12 +5092,16 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3797,12 +5109,18 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3810,11 +5128,17 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3822,11 +5146,17 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3834,12 +5164,18 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3847,11 +5183,17 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3859,11 +5201,17 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3871,11 +5219,17 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3883,11 +5237,17 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3895,48 +5255,72 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -3944,12 +5328,18 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX1250-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -3964,12 +5354,16 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX6-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3977,12 +5371,18 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; ; GFX7-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3990,11 +5390,17 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4002,11 +5408,17 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4014,12 +5426,18 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4027,11 +5445,17 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4039,11 +5463,17 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4051,11 +5481,17 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4063,11 +5499,17 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4075,48 +5517,72 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -4124,12 +5590,18 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX1250-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -4144,12 +5616,16 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4157,12 +5633,18 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4170,11 +5652,17 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4182,11 +5670,17 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4194,12 +5688,18 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4207,11 +5707,17 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4219,11 +5725,17 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4231,11 +5743,17 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4243,11 +5761,17 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4255,48 +5779,72 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -4304,12 +5852,18 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -4324,12 +5878,16 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4337,12 +5895,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4350,11 +5914,17 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4362,11 +5932,17 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4374,12 +5950,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4387,11 +5969,17 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4399,11 +5987,17 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4411,11 +6005,17 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4423,11 +6023,17 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4435,48 +6041,72 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -4484,12 +6114,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -4504,12 +6140,16 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4517,12 +6157,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4530,11 +6176,17 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4542,11 +6194,17 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4554,12 +6212,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4567,11 +6231,17 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4579,11 +6249,17 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4591,11 +6267,17 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4603,11 +6285,17 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4615,48 +6303,72 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -4664,12 +6376,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -4684,12 +6402,16 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4697,12 +6419,18 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4710,11 +6438,17 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4722,11 +6456,17 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4734,12 +6474,18 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4747,11 +6493,17 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4759,11 +6511,17 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4771,11 +6529,17 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4783,11 +6547,17 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4795,48 +6565,72 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -4844,12 +6638,18 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -4864,12 +6664,16 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX6-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4877,12 +6681,18 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4890,11 +6700,17 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4902,11 +6718,17 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4914,12 +6736,18 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4927,11 +6755,17 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4939,11 +6773,17 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4951,11 +6791,17 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4963,11 +6809,17 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4975,48 +6827,72 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -5024,12 +6900,18 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX1250-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -5044,12 +6926,16 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5057,12 +6943,18 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5070,11 +6962,17 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5082,11 +6980,17 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5094,12 +6998,18 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5107,11 +7017,17 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5119,11 +7035,17 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5131,11 +7053,17 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5143,11 +7071,17 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5155,48 +7089,72 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -5204,12 +7162,18 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -5224,12 +7188,16 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5237,12 +7205,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5250,11 +7224,17 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5262,11 +7242,17 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5274,12 +7260,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5287,11 +7279,17 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5299,11 +7297,17 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5311,11 +7315,17 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5323,11 +7333,17 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5335,48 +7351,72 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -5384,12 +7424,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -5404,12 +7450,16 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5422,6 +7472,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5439,6 +7495,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5454,6 +7516,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5469,6 +7537,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -5486,6 +7560,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5501,6 +7581,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5516,6 +7602,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5531,6 +7623,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5546,6 +7644,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5561,6 +7665,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5576,6 +7686,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5591,6 +7707,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5607,6 +7729,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -5631,12 +7759,16 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5649,6 +7781,12 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5666,6 +7804,12 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5681,6 +7825,12 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5696,6 +7846,12 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -5713,6 +7869,12 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5728,6 +7890,12 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5743,6 +7911,12 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5758,6 +7932,12 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5773,6 +7953,12 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5788,6 +7974,12 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5803,6 +7995,12 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5818,6 +8016,12 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5834,6 +8038,12 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -5858,12 +8068,16 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5876,6 +8090,12 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5893,6 +8113,12 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5908,6 +8134,12 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5923,6 +8155,12 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -5940,6 +8178,12 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5955,6 +8199,12 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5970,6 +8220,12 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5985,6 +8241,12 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6000,6 +8262,12 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6015,6 +8283,12 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6030,6 +8304,12 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6045,6 +8325,12 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6061,6 +8347,12 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6085,12 +8377,16 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6103,6 +8399,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -6120,6 +8422,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6135,6 +8443,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6150,6 +8464,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -6167,6 +8487,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6182,6 +8508,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6197,6 +8529,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6212,6 +8550,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6227,6 +8571,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6242,6 +8592,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6257,6 +8613,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6272,6 +8634,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6288,6 +8656,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6312,12 +8686,16 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6330,6 +8708,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -6347,6 +8731,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6362,6 +8752,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6377,6 +8773,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -6394,6 +8796,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6409,6 +8817,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6424,6 +8838,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6439,6 +8859,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6454,6 +8880,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6469,6 +8901,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6484,6 +8922,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6499,6 +8943,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6515,6 +8965,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6539,12 +8995,16 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6557,6 +9017,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -6574,6 +9040,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6589,6 +9061,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6604,6 +9082,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -6621,6 +9105,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6636,6 +9126,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6651,6 +9147,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6666,6 +9168,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6681,6 +9189,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6696,6 +9210,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6711,6 +9231,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6726,6 +9252,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6742,6 +9274,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6766,12 +9304,16 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6784,6 +9326,12 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -6801,6 +9349,12 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6816,6 +9370,12 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6831,6 +9391,12 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -6848,6 +9414,12 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6863,6 +9435,12 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6878,6 +9456,12 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6893,6 +9477,12 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6908,6 +9498,12 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6923,6 +9519,12 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6938,6 +9540,12 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6953,6 +9561,12 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6969,6 +9583,12 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6993,12 +9613,16 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -7011,6 +9635,12 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX7-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7028,6 +9658,12 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7043,6 +9679,12 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7058,6 +9700,12 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7075,6 +9723,12 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7090,6 +9744,12 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7105,6 +9765,12 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7120,6 +9786,12 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7135,6 +9807,12 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7150,6 +9828,12 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7165,6 +9849,12 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7180,6 +9870,12 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7196,6 +9892,12 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7220,12 +9922,16 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -7238,6 +9944,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7255,6 +9967,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7270,6 +9988,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7285,6 +10009,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7302,6 +10032,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7317,6 +10053,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7332,6 +10074,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7347,6 +10095,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7362,6 +10116,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7377,6 +10137,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7392,6 +10158,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7407,6 +10179,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7423,6 +10201,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7447,12 +10231,16 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -7465,6 +10253,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7482,6 +10276,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7497,6 +10297,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7512,6 +10318,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7529,6 +10341,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7544,6 +10362,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7559,6 +10383,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7574,6 +10404,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7589,6 +10425,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7604,6 +10446,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7619,6 +10467,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7634,6 +10488,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7650,6 +10510,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7674,12 +10540,16 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -7692,6 +10562,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7709,6 +10585,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7724,6 +10606,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7739,6 +10627,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7756,6 +10650,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7771,6 +10671,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7786,6 +10692,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7801,6 +10713,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7816,6 +10734,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7831,6 +10755,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7846,6 +10776,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7861,6 +10797,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7877,6 +10819,12 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7901,12 +10849,16 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -7919,6 +10871,12 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7936,6 +10894,12 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7951,6 +10915,12 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7966,6 +10936,12 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7983,6 +10959,12 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7998,6 +10980,12 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8013,6 +11001,12 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8028,6 +11022,12 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8043,6 +11043,12 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8058,6 +11064,12 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8073,6 +11085,12 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8088,6 +11106,12 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8104,6 +11128,12 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -8128,12 +11158,16 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -8146,6 +11180,12 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -8163,6 +11203,12 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8178,6 +11224,12 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8193,6 +11245,12 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -8210,6 +11268,12 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8225,6 +11289,12 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8240,6 +11310,12 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8255,6 +11331,12 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8270,6 +11352,12 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8285,6 +11373,12 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8300,6 +11394,12 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8315,6 +11415,12 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8331,6 +11437,12 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -8355,12 +11467,16 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -8373,6 +11489,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -8390,6 +11512,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8405,6 +11533,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8420,6 +11554,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -8437,6 +11577,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8452,6 +11598,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8467,6 +11619,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8482,6 +11640,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8497,6 +11661,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8512,6 +11682,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8527,6 +11703,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8542,6 +11724,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8558,6 +11746,12 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -8582,12 +11776,16 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -8600,6 +11798,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -8617,6 +11821,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8632,6 +11842,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8647,6 +11863,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -8664,6 +11886,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8679,6 +11907,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8694,6 +11928,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8709,6 +11949,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8724,6 +11970,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8739,6 +11991,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8754,6 +12012,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8769,6 +12033,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8785,6 +12055,12 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -8824,135 +12100,169 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load( ; ; GFX7-LABEL: local_singlethread_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -8960,11 +12270,15 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load( ; ; GFX12-CU-LABEL: local_singlethread_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -8973,11 +12287,15 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load( ; GFX1250-LABEL: local_singlethread_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -9008,135 +12326,169 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( ; ; GFX7-LABEL: local_singlethread_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -9144,11 +12496,15 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( ; ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -9157,11 +12513,15 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( ; GFX1250-LABEL: local_singlethread_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -9192,135 +12552,169 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; ; GFX7-LABEL: local_singlethread_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -9328,11 +12722,15 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -9341,11 +12739,15 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX1250-LABEL: local_singlethread_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -9376,135 +12778,169 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -9512,11 +12948,15 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -9525,11 +12965,15 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -9544,12 +12988,14 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; GFX6-LABEL: local_singlethread_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -9557,6 +13003,10 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; GFX7-LABEL: local_singlethread_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9568,6 +13018,10 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; GFX10-WGP-LABEL: local_singlethread_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -9578,6 +13032,10 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; GFX10-CU-LABEL: local_singlethread_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -9588,6 +13046,10 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9599,6 +13061,10 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -9609,6 +13075,10 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -9619,6 +13089,10 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -9629,6 +13103,10 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -9639,6 +13117,10 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; GFX11-WGP-LABEL: local_singlethread_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -9649,6 +13131,10 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; GFX11-CU-LABEL: local_singlethread_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -9659,6 +13145,10 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; GFX12-WGP-LABEL: local_singlethread_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -9669,6 +13159,10 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; GFX12-CU-LABEL: local_singlethread_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -9680,6 +13174,10 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -9695,12 +13193,14 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; GFX6-LABEL: local_singlethread_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -9708,6 +13208,10 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; GFX7-LABEL: local_singlethread_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9719,6 +13223,10 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -9729,6 +13237,10 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -9739,6 +13251,10 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9750,6 +13266,10 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -9760,6 +13280,10 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -9770,6 +13294,10 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -9780,6 +13308,10 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -9790,6 +13322,10 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -9800,6 +13336,10 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -9810,6 +13350,10 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -9820,6 +13364,10 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -9831,6 +13379,10 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -9846,12 +13398,14 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX6-LABEL: local_singlethread_one_as_release_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -9859,6 +13413,10 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX7-LABEL: local_singlethread_one_as_release_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9870,6 +13428,10 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX10-WGP-LABEL: local_singlethread_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -9880,6 +13442,10 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX10-CU-LABEL: local_singlethread_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -9890,6 +13456,10 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9901,6 +13471,10 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -9911,6 +13485,10 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -9921,6 +13499,10 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -9931,6 +13513,10 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -9941,6 +13527,10 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX11-WGP-LABEL: local_singlethread_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -9951,6 +13541,10 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX11-CU-LABEL: local_singlethread_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -9961,6 +13555,10 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX12-WGP-LABEL: local_singlethread_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -9971,6 +13569,10 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX12-CU-LABEL: local_singlethread_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -9982,6 +13584,10 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -9997,12 +13603,14 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX6-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -10010,6 +13618,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX7-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10021,6 +13633,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -10031,6 +13647,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -10041,6 +13661,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10052,6 +13676,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -10062,6 +13690,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -10072,6 +13704,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -10082,6 +13718,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -10092,6 +13732,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -10102,6 +13746,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -10112,6 +13760,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -10122,6 +13774,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -10133,6 +13789,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -10149,133 +13809,183 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( ; GFX6-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -10283,10 +13993,14 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( ; GFX1250-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -10300,133 +14014,183 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX6-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -10434,10 +14198,14 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX1250-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -10451,133 +14219,183 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX6-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -10585,10 +14403,14 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX1250-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -10602,133 +14424,183 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -10736,10 +14608,14 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX1250-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -10753,133 +14629,183 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -10887,10 +14813,14 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -10904,11 +14834,13 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -10920,6 +14852,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10935,6 +14871,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -10948,6 +14888,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -10961,6 +14905,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10976,6 +14924,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -10989,6 +14941,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -11002,6 +14958,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -11015,6 +14975,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -11028,6 +14992,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11041,6 +15009,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -11054,6 +15026,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11067,6 +15043,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -11081,6 +15061,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -11101,11 +15085,13 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -11117,6 +15103,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11132,6 +15122,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -11145,6 +15139,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -11158,6 +15156,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11173,6 +15175,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -11186,6 +15192,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -11199,6 +15209,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -11212,6 +15226,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -11225,6 +15243,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11238,6 +15260,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -11251,6 +15277,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11264,6 +15294,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -11278,6 +15312,10 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -11298,11 +15336,13 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -11314,6 +15354,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11329,6 +15373,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -11342,6 +15390,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -11355,6 +15407,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11370,6 +15426,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -11383,6 +15443,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -11396,6 +15460,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -11409,6 +15477,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -11422,6 +15494,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11435,6 +15511,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -11448,6 +15528,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11461,6 +15545,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -11475,6 +15563,10 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -11495,12 +15587,16 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; GFX6-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11508,12 +15604,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; ; GFX7-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11521,11 +15623,17 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11533,11 +15641,17 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11545,12 +15659,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11558,11 +15678,17 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11570,11 +15696,17 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11582,11 +15714,17 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11594,11 +15732,17 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11606,48 +15750,72 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -11655,12 +15823,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; GFX1250-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -11675,12 +15849,16 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11688,12 +15866,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11701,11 +15885,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11713,11 +15903,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11725,12 +15921,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11738,11 +15940,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11750,11 +15958,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11762,11 +15976,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11774,11 +15994,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11786,48 +16012,72 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -11835,12 +16085,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -11855,12 +16111,16 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11868,12 +16128,18 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11881,11 +16147,17 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11893,11 +16165,17 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11905,12 +16183,18 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11918,11 +16202,17 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11930,11 +16220,17 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11942,11 +16238,17 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11954,11 +16256,17 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11966,48 +16274,72 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -12015,12 +16347,18 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX1250-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -12035,12 +16373,16 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12048,12 +16390,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12061,11 +16409,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12073,11 +16427,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12085,12 +16445,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12098,11 +16464,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12110,11 +16482,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12122,11 +16500,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12134,11 +16518,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12146,48 +16536,72 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -12195,12 +16609,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -12215,12 +16635,16 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12228,12 +16652,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12241,11 +16671,17 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12253,11 +16689,17 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12265,12 +16707,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12278,11 +16726,17 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12290,11 +16744,17 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12302,11 +16762,17 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12314,11 +16780,17 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12326,48 +16798,72 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -12375,12 +16871,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -12395,12 +16897,16 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12408,12 +16914,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12421,11 +16933,17 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12433,11 +16951,17 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12445,12 +16969,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12458,11 +16988,17 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12470,11 +17006,17 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12482,11 +17024,17 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12494,11 +17042,17 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12506,48 +17060,72 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -12555,12 +17133,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -12575,12 +17159,16 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12588,12 +17176,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12601,11 +17195,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12613,11 +17213,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12625,12 +17231,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12638,11 +17250,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12650,11 +17268,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12662,11 +17286,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12674,11 +17304,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12686,48 +17322,72 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -12735,12 +17395,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX1250-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -12755,12 +17421,16 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12768,12 +17438,18 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12781,11 +17457,17 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12793,11 +17475,17 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12805,12 +17493,18 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12818,11 +17512,17 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12830,11 +17530,17 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12842,11 +17548,17 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12854,11 +17566,17 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12866,48 +17584,72 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -12915,12 +17657,18 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX1250-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -12935,12 +17683,16 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12948,12 +17700,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12961,11 +17719,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12973,11 +17737,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12985,12 +17755,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12998,11 +17774,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13010,11 +17792,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13022,11 +17810,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13034,11 +17828,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13046,48 +17846,72 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13095,12 +17919,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13115,12 +17945,16 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13128,12 +17962,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13141,11 +17981,17 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13153,11 +17999,17 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13165,12 +18017,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13178,11 +18036,17 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13190,11 +18054,17 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13202,11 +18072,17 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13214,11 +18090,17 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13226,48 +18108,72 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13275,12 +18181,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13295,12 +18207,16 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13308,12 +18224,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13321,11 +18243,17 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13333,11 +18261,17 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13345,12 +18279,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13358,11 +18298,17 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13370,11 +18316,17 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13382,11 +18334,17 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13394,11 +18352,17 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13406,48 +18370,72 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13455,12 +18443,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13475,12 +18469,16 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13488,12 +18486,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13501,11 +18505,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13513,11 +18523,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13525,12 +18541,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13538,11 +18560,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13550,11 +18578,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13562,11 +18596,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13574,11 +18614,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13586,48 +18632,72 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13635,12 +18705,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13655,12 +18731,16 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13668,12 +18748,18 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13681,11 +18767,17 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13693,11 +18785,17 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13705,12 +18803,18 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13718,11 +18822,17 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13730,11 +18840,17 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13742,11 +18858,17 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13754,11 +18876,17 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13766,48 +18894,72 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13815,12 +18967,18 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX1250-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13835,12 +18993,16 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13848,12 +19010,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13861,11 +19029,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13873,11 +19047,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13885,12 +19065,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13898,11 +19084,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13910,11 +19102,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13922,11 +19120,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13934,11 +19138,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13946,48 +19156,72 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13995,12 +19229,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14015,12 +19255,16 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14028,12 +19272,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14041,11 +19291,17 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14053,11 +19309,17 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14065,12 +19327,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14078,11 +19346,17 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14090,11 +19364,17 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14102,11 +19382,17 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14114,11 +19400,17 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14126,48 +19418,72 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14175,12 +19491,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14195,12 +19517,16 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; GFX6-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -14213,6 +19539,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; GFX7-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -14230,6 +19562,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14245,6 +19583,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14260,6 +19604,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -14277,6 +19627,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14292,6 +19648,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14307,6 +19669,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14322,6 +19690,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14337,6 +19711,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14352,6 +19732,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14367,6 +19753,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14382,6 +19774,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14398,6 +19796,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14422,12 +19826,16 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX6-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -14440,6 +19848,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX7-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -14457,6 +19871,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14472,6 +19892,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14487,6 +19913,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -14504,6 +19936,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14519,6 +19957,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14534,6 +19978,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14549,6 +19999,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14564,6 +20020,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14579,6 +20041,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14594,6 +20062,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14609,6 +20083,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14625,6 +20105,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14649,12 +20135,16 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX6-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -14667,6 +20157,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX7-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -14684,6 +20180,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX10-WGP-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14699,6 +20201,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX10-CU-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14714,6 +20222,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -14731,6 +20245,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14746,6 +20266,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14761,6 +20287,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14776,6 +20308,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14791,6 +20329,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX11-WGP-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14806,6 +20350,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX11-CU-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14821,6 +20371,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX12-WGP-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14836,6 +20392,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX12-CU-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14852,6 +20414,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14876,12 +20444,16 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX6-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -14894,6 +20466,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX7-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -14911,6 +20489,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14926,6 +20510,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14941,6 +20531,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -14958,6 +20554,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14973,6 +20575,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14988,6 +20596,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15003,6 +20617,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15018,6 +20638,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15033,6 +20659,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15048,6 +20680,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15063,6 +20701,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15079,6 +20723,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15103,12 +20753,16 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX6-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -15121,6 +20775,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX7-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -15138,6 +20798,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15153,6 +20819,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15168,6 +20840,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -15185,6 +20863,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15200,6 +20884,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15215,6 +20905,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15230,6 +20926,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15245,6 +20947,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15260,6 +20968,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15275,6 +20989,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15290,6 +21010,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15306,6 +21032,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15330,12 +21062,16 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX6-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -15348,6 +21084,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX7-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -15365,6 +21107,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15380,6 +21128,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15395,6 +21149,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -15412,6 +21172,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15427,6 +21193,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15442,6 +21214,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15457,6 +21235,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15472,6 +21256,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15487,6 +21277,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15502,6 +21298,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15517,6 +21319,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15533,6 +21341,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15557,12 +21371,16 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX6-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -15575,6 +21393,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX7-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -15592,6 +21416,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15607,6 +21437,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15622,6 +21458,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -15639,6 +21481,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15654,6 +21502,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15669,6 +21523,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15684,6 +21544,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15699,6 +21565,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15714,6 +21586,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15729,6 +21607,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15744,6 +21628,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15760,6 +21650,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15784,12 +21680,16 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX6-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -15802,6 +21702,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX7-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -15819,6 +21725,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX10-WGP-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15834,6 +21746,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX10-CU-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15849,6 +21767,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -15866,6 +21790,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15881,6 +21811,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15896,6 +21832,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15911,6 +21853,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15926,6 +21874,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX11-WGP-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15941,6 +21895,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX11-CU-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15956,6 +21916,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15971,6 +21937,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15987,6 +21959,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16011,12 +21989,16 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX6-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16029,6 +22011,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX7-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16046,6 +22034,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16061,6 +22055,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16076,6 +22076,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16093,6 +22099,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16108,6 +22120,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16123,6 +22141,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16138,6 +22162,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16153,6 +22183,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16168,6 +22204,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16183,6 +22225,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16198,6 +22246,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16214,6 +22268,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16238,12 +22298,16 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX6-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16256,6 +22320,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX7-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16273,6 +22343,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16288,6 +22364,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16303,6 +22385,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16320,6 +22408,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16335,6 +22429,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16350,6 +22450,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16365,6 +22471,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16380,6 +22492,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16395,6 +22513,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16410,6 +22534,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16425,6 +22555,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16441,6 +22577,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16465,12 +22607,16 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX6-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16483,6 +22629,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX7-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16500,6 +22652,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16515,6 +22673,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16530,6 +22694,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16547,6 +22717,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16562,6 +22738,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16577,6 +22759,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16592,6 +22780,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16607,6 +22801,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16622,6 +22822,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16637,6 +22843,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16652,6 +22864,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16668,6 +22886,12 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16692,12 +22916,16 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX6-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16710,6 +22938,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX7-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16727,6 +22961,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16742,6 +22982,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16757,6 +23003,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16774,6 +23026,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16789,6 +23047,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16804,6 +23068,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16819,6 +23089,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16834,6 +23110,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16849,6 +23131,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16864,6 +23152,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16879,6 +23173,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16895,6 +23195,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16919,12 +23225,16 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX6-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16937,6 +23247,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX7-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16954,6 +23270,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX10-WGP-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16969,6 +23291,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX10-CU-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16984,6 +23312,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -17001,6 +23335,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17016,6 +23356,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17031,6 +23377,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17046,6 +23398,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17061,6 +23419,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX11-WGP-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17076,6 +23440,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX11-CU-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17091,6 +23461,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17106,6 +23482,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17122,6 +23504,12 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -17146,12 +23534,16 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX6-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -17164,6 +23556,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX7-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -17181,6 +23579,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17196,6 +23600,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17211,6 +23621,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -17228,6 +23644,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17243,6 +23665,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17258,6 +23686,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17273,6 +23707,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17288,6 +23728,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17303,6 +23749,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17318,6 +23770,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17333,6 +23791,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17349,6 +23813,12 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -17373,12 +23843,16 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX6-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -17391,6 +23865,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX7-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -17408,6 +23888,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17423,6 +23909,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17438,6 +23930,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -17455,6 +23953,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17470,6 +23974,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17485,6 +23995,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17500,6 +24016,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17515,6 +24037,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17530,6 +24058,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17545,6 +24079,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17560,6 +24100,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17576,6 +24122,12 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll index a4be523b566d..bd22e80baf9e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -33,135 +33,169 @@ define amdgpu_kernel void @local_system_unordered_load( ; ; GFX7-LABEL: local_system_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -169,11 +203,15 @@ define amdgpu_kernel void @local_system_unordered_load( ; ; GFX12-CU-LABEL: local_system_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -182,11 +220,15 @@ define amdgpu_kernel void @local_system_unordered_load( ; GFX1250-LABEL: local_system_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -217,135 +259,169 @@ define amdgpu_kernel void @local_system_monotonic_load( ; ; GFX7-LABEL: local_system_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -353,11 +429,15 @@ define amdgpu_kernel void @local_system_monotonic_load( ; ; GFX12-CU-LABEL: local_system_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -366,11 +446,15 @@ define amdgpu_kernel void @local_system_monotonic_load( ; GFX1250-LABEL: local_system_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -401,10 +485,13 @@ define amdgpu_kernel void @local_system_acquire_load( ; ; GFX7-LABEL: local_system_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -415,9 +502,12 @@ define amdgpu_kernel void @local_system_acquire_load( ; ; GFX10-WGP-LABEL: local_system_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -428,9 +518,12 @@ define amdgpu_kernel void @local_system_acquire_load( ; ; GFX10-CU-LABEL: local_system_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -440,10 +533,13 @@ define amdgpu_kernel void @local_system_acquire_load( ; ; SKIP-CACHE-INV-LABEL: local_system_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -454,9 +550,12 @@ define amdgpu_kernel void @local_system_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -466,22 +565,28 @@ define amdgpu_kernel void @local_system_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: local_system_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -491,22 +596,28 @@ define amdgpu_kernel void @local_system_acquire_load( ; ; GFX942-TGSPLIT-LABEL: local_system_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -517,9 +628,12 @@ define amdgpu_kernel void @local_system_acquire_load( ; ; GFX11-CU-LABEL: local_system_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -529,25 +643,33 @@ define amdgpu_kernel void @local_system_acquire_load( ; ; GFX12-WGP-LABEL: local_system_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_system_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -555,12 +677,16 @@ define amdgpu_kernel void @local_system_acquire_load( ; GFX1250-LABEL: local_system_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 ; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: ds_store_b32 v0, v1 ; GFX1250-NEXT: s_endpgm @@ -591,10 +717,13 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; ; GFX7-LABEL: local_system_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_read_b32 v1, v0 @@ -606,9 +735,12 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; ; GFX10-WGP-LABEL: local_system_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -621,9 +753,12 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; ; GFX10-CU-LABEL: local_system_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -635,10 +770,13 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 @@ -650,9 +788,12 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -663,9 +804,12 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -677,9 +821,12 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -690,9 +837,12 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; ; GFX942-TGSPLIT-LABEL: local_system_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -704,9 +854,12 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; ; GFX11-WGP-LABEL: local_system_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -719,9 +872,12 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; ; GFX11-CU-LABEL: local_system_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -733,9 +889,12 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; ; GFX12-WGP-LABEL: local_system_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -744,15 +903,19 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_system_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -760,6 +923,7 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -767,14 +931,18 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX1250-LABEL: local_system_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_load_b32 v1, v0 ; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: ds_store_b32 v0, v1 ; GFX1250-NEXT: s_endpgm @@ -788,12 +956,14 @@ entry: define amdgpu_kernel void @local_system_unordered_store( ; GFX6-LABEL: local_system_unordered_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -801,6 +971,10 @@ define amdgpu_kernel void @local_system_unordered_store( ; GFX7-LABEL: local_system_unordered_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -812,6 +986,10 @@ define amdgpu_kernel void @local_system_unordered_store( ; GFX10-WGP-LABEL: local_system_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -822,6 +1000,10 @@ define amdgpu_kernel void @local_system_unordered_store( ; GFX10-CU-LABEL: local_system_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -832,6 +1014,10 @@ define amdgpu_kernel void @local_system_unordered_store( ; SKIP-CACHE-INV-LABEL: local_system_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -843,6 +1029,10 @@ define amdgpu_kernel void @local_system_unordered_store( ; GFX90A-NOTTGSPLIT-LABEL: local_system_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -853,6 +1043,10 @@ define amdgpu_kernel void @local_system_unordered_store( ; GFX90A-TGSPLIT-LABEL: local_system_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -863,6 +1057,10 @@ define amdgpu_kernel void @local_system_unordered_store( ; GFX942-NOTTGSPLIT-LABEL: local_system_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -873,6 +1071,10 @@ define amdgpu_kernel void @local_system_unordered_store( ; GFX942-TGSPLIT-LABEL: local_system_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -883,6 +1085,10 @@ define amdgpu_kernel void @local_system_unordered_store( ; GFX11-WGP-LABEL: local_system_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -893,6 +1099,10 @@ define amdgpu_kernel void @local_system_unordered_store( ; GFX11-CU-LABEL: local_system_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -903,6 +1113,10 @@ define amdgpu_kernel void @local_system_unordered_store( ; GFX12-WGP-LABEL: local_system_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -913,6 +1127,10 @@ define amdgpu_kernel void @local_system_unordered_store( ; GFX12-CU-LABEL: local_system_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -924,6 +1142,10 @@ define amdgpu_kernel void @local_system_unordered_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -939,12 +1161,14 @@ entry: define amdgpu_kernel void @local_system_monotonic_store( ; GFX6-LABEL: local_system_monotonic_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -952,6 +1176,10 @@ define amdgpu_kernel void @local_system_monotonic_store( ; GFX7-LABEL: local_system_monotonic_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -963,6 +1191,10 @@ define amdgpu_kernel void @local_system_monotonic_store( ; GFX10-WGP-LABEL: local_system_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -973,6 +1205,10 @@ define amdgpu_kernel void @local_system_monotonic_store( ; GFX10-CU-LABEL: local_system_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -983,6 +1219,10 @@ define amdgpu_kernel void @local_system_monotonic_store( ; SKIP-CACHE-INV-LABEL: local_system_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -994,6 +1234,10 @@ define amdgpu_kernel void @local_system_monotonic_store( ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1004,6 +1248,10 @@ define amdgpu_kernel void @local_system_monotonic_store( ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1014,6 +1262,10 @@ define amdgpu_kernel void @local_system_monotonic_store( ; GFX942-NOTTGSPLIT-LABEL: local_system_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1024,6 +1276,10 @@ define amdgpu_kernel void @local_system_monotonic_store( ; GFX942-TGSPLIT-LABEL: local_system_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1034,6 +1290,10 @@ define amdgpu_kernel void @local_system_monotonic_store( ; GFX11-WGP-LABEL: local_system_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1044,6 +1304,10 @@ define amdgpu_kernel void @local_system_monotonic_store( ; GFX11-CU-LABEL: local_system_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1054,6 +1318,10 @@ define amdgpu_kernel void @local_system_monotonic_store( ; GFX12-WGP-LABEL: local_system_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1064,6 +1332,10 @@ define amdgpu_kernel void @local_system_monotonic_store( ; GFX12-CU-LABEL: local_system_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1075,6 +1347,10 @@ define amdgpu_kernel void @local_system_monotonic_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -1090,12 +1366,14 @@ entry: define amdgpu_kernel void @local_system_release_store( ; GFX6-LABEL: local_system_release_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -1104,6 +1382,10 @@ define amdgpu_kernel void @local_system_release_store( ; GFX7-LABEL: local_system_release_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1116,6 +1398,10 @@ define amdgpu_kernel void @local_system_release_store( ; GFX10-WGP-LABEL: local_system_release_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -1128,6 +1414,10 @@ define amdgpu_kernel void @local_system_release_store( ; GFX10-CU-LABEL: local_system_release_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -1140,6 +1430,10 @@ define amdgpu_kernel void @local_system_release_store( ; SKIP-CACHE-INV-LABEL: local_system_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1152,6 +1446,10 @@ define amdgpu_kernel void @local_system_release_store( ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1163,6 +1461,10 @@ define amdgpu_kernel void @local_system_release_store( ; GFX90A-TGSPLIT-LABEL: local_system_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1174,6 +1476,10 @@ define amdgpu_kernel void @local_system_release_store( ; GFX942-NOTTGSPLIT-LABEL: local_system_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1185,6 +1491,10 @@ define amdgpu_kernel void @local_system_release_store( ; GFX942-TGSPLIT-LABEL: local_system_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1196,6 +1506,10 @@ define amdgpu_kernel void @local_system_release_store( ; GFX11-WGP-LABEL: local_system_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1208,6 +1522,10 @@ define amdgpu_kernel void @local_system_release_store( ; GFX11-CU-LABEL: local_system_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1220,6 +1538,10 @@ define amdgpu_kernel void @local_system_release_store( ; GFX12-WGP-LABEL: local_system_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1234,6 +1556,10 @@ define amdgpu_kernel void @local_system_release_store( ; GFX12-CU-LABEL: local_system_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1249,6 +1575,10 @@ define amdgpu_kernel void @local_system_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -1266,12 +1596,14 @@ entry: define amdgpu_kernel void @local_system_seq_cst_store( ; GFX6-LABEL: local_system_seq_cst_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -1280,6 +1612,10 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX7-LABEL: local_system_seq_cst_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1292,6 +1628,10 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX10-WGP-LABEL: local_system_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -1304,6 +1644,10 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX10-CU-LABEL: local_system_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -1316,6 +1660,10 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1328,6 +1676,10 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1339,6 +1691,10 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1350,6 +1706,10 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX942-NOTTGSPLIT-LABEL: local_system_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1361,6 +1721,10 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX942-TGSPLIT-LABEL: local_system_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1372,6 +1736,10 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX11-WGP-LABEL: local_system_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1384,6 +1752,10 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX11-CU-LABEL: local_system_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1396,6 +1768,10 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX12-WGP-LABEL: local_system_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1410,6 +1786,10 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX12-CU-LABEL: local_system_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1425,6 +1805,10 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -1443,133 +1827,183 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw( ; GFX6-LABEL: local_system_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_system_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -1577,10 +2011,14 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw( ; GFX1250-LABEL: local_system_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -1594,11 +2032,13 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; GFX6-LABEL: local_system_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1606,11 +2046,15 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; GFX7-LABEL: local_system_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1618,10 +2062,14 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: local_system_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1630,10 +2078,14 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; GFX10-CU-LABEL: local_system_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1641,11 +2093,15 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_system_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1653,10 +2109,14 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1664,10 +2124,14 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_system_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -1675,10 +2139,14 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1686,10 +2154,14 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: local_system_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 @@ -1697,10 +2169,14 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: local_system_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1709,10 +2185,14 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; GFX11-CU-LABEL: local_system_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1720,10 +2200,14 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: local_system_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -1732,10 +2216,14 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; GFX12-CU-LABEL: local_system_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -1744,10 +2232,14 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; GFX1250-LABEL: local_system_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -1762,11 +2254,13 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX6-LABEL: local_system_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1774,11 +2268,15 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; GFX7-LABEL: local_system_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1786,10 +2284,14 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; GFX10-WGP-LABEL: local_system_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1798,10 +2300,14 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; GFX10-CU-LABEL: local_system_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1810,11 +2316,15 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_system_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1822,10 +2332,14 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1833,10 +2347,14 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_system_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1844,10 +2362,14 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1855,10 +2377,14 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: local_system_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1866,10 +2392,14 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; GFX11-WGP-LABEL: local_system_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1878,10 +2408,14 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; GFX11-CU-LABEL: local_system_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1890,10 +2424,14 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; GFX12-WGP-LABEL: local_system_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -1904,10 +2442,14 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; GFX12-CU-LABEL: local_system_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -1919,10 +2461,14 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX1250-LABEL: local_system_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1938,11 +2484,13 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX6-LABEL: local_system_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1951,11 +2499,15 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; GFX7-LABEL: local_system_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1964,10 +2516,14 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: local_system_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1978,10 +2534,14 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: local_system_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1991,11 +2551,15 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2004,10 +2568,14 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2016,10 +2584,14 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2028,10 +2600,14 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2040,10 +2616,14 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: local_system_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2052,10 +2632,14 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: local_system_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2066,10 +2650,14 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: local_system_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2079,10 +2667,14 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: local_system_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -2095,10 +2687,14 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: local_system_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -2111,10 +2707,14 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX1250-LABEL: local_system_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2131,11 +2731,13 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX6-LABEL: local_system_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2144,11 +2746,15 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; GFX7-LABEL: local_system_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2157,10 +2763,14 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: local_system_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2171,10 +2781,14 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: local_system_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2184,11 +2798,15 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2197,10 +2815,14 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2209,10 +2831,14 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2221,10 +2847,14 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2233,10 +2863,14 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: local_system_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2245,10 +2879,14 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: local_system_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2259,10 +2897,14 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: local_system_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2272,10 +2914,14 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: local_system_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -2288,10 +2934,14 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: local_system_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -2304,10 +2954,14 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX1250-LABEL: local_system_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2324,11 +2978,13 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX6-LABEL: local_system_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2340,6 +2996,10 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX7-LABEL: local_system_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2355,6 +3015,10 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX10-WGP-LABEL: local_system_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2369,6 +3033,10 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX10-CU-LABEL: local_system_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2382,6 +3050,10 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_system_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2397,6 +3069,10 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2410,6 +3086,10 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_system_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2424,6 +3104,10 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_system_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2437,6 +3121,10 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_system_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2451,6 +3139,10 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: local_system_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2465,6 +3157,10 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: local_system_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2478,6 +3174,10 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: local_system_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2492,6 +3192,10 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: local_system_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2506,6 +3210,10 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -2526,11 +3234,13 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 @@ -2543,6 +3253,10 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX7-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2559,6 +3273,10 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX10-WGP-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2575,6 +3293,10 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX10-CU-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2590,6 +3312,10 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2606,6 +3332,10 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2620,6 +3350,10 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2635,6 +3369,10 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2649,6 +3387,10 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2664,6 +3406,10 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2680,6 +3426,10 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2695,6 +3445,10 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2713,6 +3467,10 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2731,6 +3489,10 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -2753,11 +3515,13 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 @@ -2770,6 +3534,10 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX7-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2786,6 +3554,10 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX10-WGP-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2802,6 +3574,10 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX10-CU-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2817,6 +3593,10 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2833,6 +3613,10 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2847,6 +3631,10 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2862,6 +3650,10 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2876,6 +3668,10 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2891,6 +3687,10 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2907,6 +3707,10 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2922,6 +3726,10 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2940,6 +3748,10 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2958,6 +3770,10 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -2980,12 +3796,16 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2993,12 +3813,18 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3006,11 +3832,17 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3018,11 +3850,17 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3030,12 +3868,18 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3043,11 +3887,17 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3055,11 +3905,17 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3067,11 +3923,17 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3079,11 +3941,17 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3091,48 +3959,72 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -3140,12 +4032,18 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -3160,12 +4058,16 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; GFX6-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3174,12 +4076,18 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3188,11 +4096,17 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3202,11 +4116,17 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3215,12 +4135,18 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3229,11 +4155,17 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3242,11 +4174,17 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3255,11 +4193,17 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3268,11 +4212,17 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3281,12 +4231,18 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3295,12 +4251,18 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3308,12 +4270,18 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -3322,12 +4290,18 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -3336,12 +4310,18 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -3357,12 +4337,16 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX6-LABEL: local_system_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3371,12 +4355,18 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; GFX7-LABEL: local_system_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3385,11 +4375,17 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3399,11 +4395,17 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_system_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3413,12 +4415,18 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3427,11 +4435,17 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3440,11 +4454,17 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3453,11 +4473,17 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3466,11 +4492,17 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3479,12 +4511,18 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3493,12 +4531,18 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_system_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3507,12 +4551,18 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -3523,12 +4573,18 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_system_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -3540,12 +4596,18 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX1250-LABEL: local_system_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3562,12 +4624,16 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3577,12 +4643,18 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3592,11 +4664,17 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3608,11 +4686,17 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3623,12 +4707,18 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3638,11 +4728,17 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3652,11 +4748,17 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3666,11 +4768,17 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3680,11 +4788,17 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3694,12 +4808,18 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3710,12 +4830,18 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3725,12 +4851,18 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -3743,12 +4875,18 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -3761,12 +4899,18 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3784,12 +4928,16 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3799,12 +4947,18 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3814,11 +4968,17 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3830,11 +4990,17 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3845,12 +5011,18 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3860,11 +5032,17 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3874,11 +5052,17 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3888,11 +5072,17 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3902,11 +5092,17 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3916,12 +5112,18 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3932,12 +5134,18 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3947,12 +5155,18 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -3965,12 +5179,18 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -3983,12 +5203,18 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -4006,12 +5232,16 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; GFX6-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4020,12 +5250,18 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4034,11 +5270,17 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4048,11 +5290,17 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4061,12 +5309,18 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4075,11 +5329,17 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4088,11 +5348,17 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4101,11 +5367,17 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4114,11 +5386,17 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4127,12 +5405,18 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4141,12 +5425,18 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4154,12 +5444,18 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -4168,12 +5464,18 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -4182,12 +5484,18 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -4203,12 +5511,16 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4217,12 +5529,18 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4231,11 +5549,17 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4245,11 +5569,17 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4258,12 +5588,18 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4272,11 +5608,17 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4285,11 +5627,17 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4298,11 +5646,17 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4311,11 +5665,17 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4324,12 +5684,18 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4338,12 +5704,18 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4351,12 +5723,18 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -4365,12 +5743,18 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -4379,12 +5763,18 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; GFX1250-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -4400,12 +5790,16 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX6-LABEL: local_system_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4415,12 +5809,18 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; GFX7-LABEL: local_system_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -4430,11 +5830,17 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4446,11 +5852,17 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_system_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4461,12 +5873,18 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4476,11 +5894,17 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4490,11 +5914,17 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4504,11 +5934,17 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4518,11 +5954,17 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4532,12 +5974,18 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4548,12 +5996,18 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_system_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4563,12 +6017,18 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -4581,12 +6041,18 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_system_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -4599,12 +6065,18 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX1250-LABEL: local_system_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -4622,12 +6094,16 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4637,12 +6113,18 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -4652,11 +6134,17 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4668,11 +6156,17 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4683,12 +6177,18 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4698,11 +6198,17 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4712,11 +6218,17 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4726,11 +6238,17 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4740,11 +6258,17 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4754,12 +6278,18 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4770,12 +6300,18 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4785,12 +6321,18 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -4803,12 +6345,18 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -4821,12 +6369,18 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -4844,12 +6398,16 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4859,12 +6417,18 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -4874,11 +6438,17 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4890,11 +6460,17 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4905,12 +6481,18 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4920,11 +6502,17 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4934,11 +6522,17 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4948,11 +6542,17 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4962,11 +6562,17 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4976,12 +6582,18 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4992,12 +6604,18 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5007,12 +6625,18 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -5025,12 +6649,18 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -5043,12 +6673,18 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5066,12 +6702,16 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5081,12 +6721,18 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5096,11 +6742,17 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5112,11 +6764,17 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5127,12 +6785,18 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5142,11 +6806,17 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5156,11 +6826,17 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5170,11 +6846,17 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5184,11 +6866,17 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5198,12 +6886,18 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5214,12 +6908,18 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5229,12 +6929,18 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -5247,12 +6953,18 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -5265,12 +6977,18 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5288,12 +7006,16 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5303,12 +7025,18 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5318,11 +7046,17 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5334,11 +7068,17 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5349,12 +7089,18 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5364,11 +7110,17 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5378,11 +7130,17 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5392,11 +7150,17 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5406,11 +7170,17 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5420,12 +7190,18 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5436,12 +7212,18 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5451,12 +7233,18 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -5469,12 +7257,18 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -5487,12 +7281,18 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5510,12 +7310,16 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX6-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5525,12 +7329,18 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5540,11 +7350,17 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5556,11 +7372,17 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5571,12 +7393,18 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5586,11 +7414,17 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5600,11 +7434,17 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5614,11 +7454,17 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5628,11 +7474,17 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5642,12 +7494,18 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5658,12 +7516,18 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5673,12 +7537,18 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -5691,12 +7561,18 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -5709,12 +7585,18 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX1250-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5732,12 +7614,16 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5747,12 +7633,18 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5762,11 +7654,17 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5778,11 +7676,17 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5793,12 +7697,18 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5808,11 +7718,17 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5822,11 +7738,17 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5836,11 +7758,17 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5850,11 +7778,17 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5864,12 +7798,18 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5880,12 +7820,18 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5895,12 +7841,18 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -5913,12 +7865,18 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -5931,12 +7889,18 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5954,12 +7918,16 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5969,12 +7937,18 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5984,11 +7958,17 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6000,11 +7980,17 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6015,12 +8001,18 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6030,11 +8022,17 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6044,11 +8042,17 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6058,11 +8062,17 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6072,11 +8082,17 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6086,12 +8102,18 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6102,12 +8124,18 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6117,12 +8145,18 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -6135,12 +8169,18 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -6153,12 +8193,18 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -6176,12 +8222,16 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6194,6 +8244,12 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -6211,6 +8267,12 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6226,6 +8288,12 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6241,6 +8309,12 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -6258,6 +8332,12 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6273,6 +8353,12 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6288,6 +8374,12 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6303,6 +8395,12 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6318,6 +8416,12 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6333,6 +8437,12 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6348,6 +8458,12 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6363,6 +8479,12 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6379,6 +8501,12 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6403,12 +8531,16 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6421,6 +8553,12 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -6438,6 +8576,12 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6454,6 +8598,12 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6469,6 +8619,12 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -6486,6 +8642,12 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6501,6 +8663,12 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6517,6 +8685,12 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6532,6 +8706,12 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6548,6 +8728,12 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6564,6 +8750,12 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6579,6 +8771,12 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6595,6 +8793,12 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6611,6 +8815,12 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6635,12 +8845,16 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6654,6 +8868,12 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -6672,6 +8892,12 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6689,6 +8915,12 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6706,6 +8938,12 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -6724,6 +8962,12 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6740,6 +8984,12 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6756,6 +9006,12 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6772,6 +9028,12 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6788,6 +9050,12 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6805,6 +9073,12 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6822,6 +9096,12 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6841,6 +9121,12 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6861,6 +9147,12 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6887,12 +9179,16 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6906,6 +9202,12 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -6924,6 +9226,12 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6942,6 +9250,12 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6959,6 +9273,12 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -6977,6 +9297,12 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6993,6 +9319,12 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7010,6 +9342,12 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7026,6 +9364,12 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7043,6 +9387,12 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7061,6 +9411,12 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7078,6 +9434,12 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7098,6 +9460,12 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7118,6 +9486,12 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7144,12 +9518,16 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7163,6 +9541,12 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7181,6 +9565,12 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7199,6 +9589,12 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7216,6 +9612,12 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7234,6 +9636,12 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7250,6 +9658,12 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7267,6 +9681,12 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7283,6 +9703,12 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7300,6 +9726,12 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7318,6 +9750,12 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7335,6 +9773,12 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7355,6 +9799,12 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7375,6 +9825,12 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7401,12 +9857,16 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -7419,6 +9879,12 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7436,6 +9902,12 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7452,6 +9924,12 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7467,6 +9945,12 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7484,6 +9968,12 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7499,6 +9989,12 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7515,6 +10011,12 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7530,6 +10032,12 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7546,6 +10054,12 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7562,6 +10076,12 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7577,6 +10097,12 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7593,6 +10119,12 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7609,6 +10141,12 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7633,12 +10171,16 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -7651,6 +10193,12 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7668,6 +10216,12 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7684,6 +10238,12 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7699,6 +10259,12 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7716,6 +10282,12 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7731,6 +10303,12 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7747,6 +10325,12 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7762,6 +10346,12 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7778,6 +10368,12 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7794,6 +10390,12 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7809,6 +10411,12 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7825,6 +10433,12 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7841,6 +10455,12 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7865,12 +10485,16 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX6-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7884,6 +10508,12 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX7-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7902,6 +10532,12 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7920,6 +10556,12 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7937,6 +10579,12 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7955,6 +10603,12 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7971,6 +10625,12 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7988,6 +10648,12 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8004,6 +10670,12 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8021,6 +10693,12 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8039,6 +10717,12 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8056,6 +10740,12 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8076,6 +10766,12 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8096,6 +10792,12 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -8122,12 +10824,16 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8141,6 +10847,12 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -8159,6 +10871,12 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8177,6 +10895,12 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8194,6 +10918,12 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -8212,6 +10942,12 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8228,6 +10964,12 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8245,6 +10987,12 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8261,6 +11009,12 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8278,6 +11032,12 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8296,6 +11056,12 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8313,6 +11079,12 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8333,6 +11105,12 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8353,6 +11131,12 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -8379,12 +11163,16 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8398,6 +11186,12 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -8416,6 +11210,12 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8434,6 +11234,12 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8451,6 +11257,12 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -8469,6 +11281,12 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8485,6 +11303,12 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8502,6 +11326,12 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8518,6 +11348,12 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8535,6 +11371,12 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8553,6 +11395,12 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8570,6 +11418,12 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8590,6 +11444,12 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8610,6 +11470,12 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -8636,12 +11502,16 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8655,6 +11525,12 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -8673,6 +11549,12 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8691,6 +11573,12 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8708,6 +11596,12 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -8726,6 +11620,12 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8742,6 +11642,12 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8759,6 +11665,12 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8775,6 +11687,12 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8792,6 +11710,12 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8810,6 +11734,12 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8827,6 +11757,12 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8847,6 +11783,12 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8867,6 +11809,12 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -8893,12 +11841,16 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8912,6 +11864,12 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -8930,6 +11888,12 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8948,6 +11912,12 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8965,6 +11935,12 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -8983,6 +11959,12 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8999,6 +11981,12 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9016,6 +12004,12 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9032,6 +12026,12 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9049,6 +12049,12 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9067,6 +12073,12 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9084,6 +12096,12 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9104,6 +12122,12 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9124,6 +12148,12 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9150,12 +12180,16 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9169,6 +12203,12 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -9187,6 +12227,12 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9205,6 +12251,12 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9222,6 +12274,12 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -9240,6 +12298,12 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9256,6 +12320,12 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9273,6 +12343,12 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9289,6 +12365,12 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9306,6 +12388,12 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9324,6 +12412,12 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9341,6 +12435,12 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9361,6 +12461,12 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9381,6 +12487,12 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9407,12 +12519,16 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9426,6 +12542,12 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -9444,6 +12566,12 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9462,6 +12590,12 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9479,6 +12613,12 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -9497,6 +12637,12 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9513,6 +12659,12 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9530,6 +12682,12 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9546,6 +12704,12 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9563,6 +12727,12 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9581,6 +12751,12 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9598,6 +12774,12 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9618,6 +12800,12 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9638,6 +12826,12 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9664,12 +12858,16 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9683,6 +12881,12 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -9701,6 +12905,12 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9719,6 +12929,12 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9736,6 +12952,12 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -9754,6 +12976,12 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9770,6 +12998,12 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9787,6 +13021,12 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9803,6 +13043,12 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9820,6 +13066,12 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9838,6 +13090,12 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9855,6 +13113,12 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9875,6 +13139,12 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9895,6 +13165,12 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9936,135 +13212,169 @@ define amdgpu_kernel void @local_system_one_as_unordered_load( ; ; GFX7-LABEL: local_system_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -10072,11 +13382,15 @@ define amdgpu_kernel void @local_system_one_as_unordered_load( ; ; GFX12-CU-LABEL: local_system_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -10085,11 +13399,15 @@ define amdgpu_kernel void @local_system_one_as_unordered_load( ; GFX1250-LABEL: local_system_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -10120,135 +13438,169 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load( ; ; GFX7-LABEL: local_system_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -10256,11 +13608,15 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load( ; ; GFX12-CU-LABEL: local_system_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -10269,11 +13625,15 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load( ; GFX1250-LABEL: local_system_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -10304,135 +13664,169 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; ; GFX7-LABEL: local_system_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -10440,11 +13834,15 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; ; GFX12-CU-LABEL: local_system_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -10453,11 +13851,15 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX1250-LABEL: local_system_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -10488,135 +13890,169 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; ; GFX7-LABEL: local_system_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -10624,11 +14060,15 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: local_system_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -10637,11 +14077,15 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX1250-LABEL: local_system_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -10656,12 +14100,14 @@ entry: define amdgpu_kernel void @local_system_one_as_unordered_store( ; GFX6-LABEL: local_system_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -10669,6 +14115,10 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; GFX7-LABEL: local_system_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10680,6 +14130,10 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; GFX10-WGP-LABEL: local_system_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -10690,6 +14144,10 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; GFX10-CU-LABEL: local_system_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -10700,6 +14158,10 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; SKIP-CACHE-INV-LABEL: local_system_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10711,6 +14173,10 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -10721,6 +14187,10 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; GFX90A-TGSPLIT-LABEL: local_system_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -10731,6 +14201,10 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -10741,6 +14215,10 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; GFX942-TGSPLIT-LABEL: local_system_one_as_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -10751,6 +14229,10 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; GFX11-WGP-LABEL: local_system_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -10761,6 +14243,10 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; GFX11-CU-LABEL: local_system_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -10771,6 +14257,10 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; GFX12-WGP-LABEL: local_system_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -10781,6 +14271,10 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; GFX12-CU-LABEL: local_system_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -10792,6 +14286,10 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -10807,12 +14305,14 @@ entry: define amdgpu_kernel void @local_system_one_as_monotonic_store( ; GFX6-LABEL: local_system_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -10820,6 +14320,10 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; GFX7-LABEL: local_system_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10831,6 +14335,10 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; GFX10-WGP-LABEL: local_system_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -10841,6 +14349,10 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; GFX10-CU-LABEL: local_system_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -10851,6 +14363,10 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10862,6 +14378,10 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -10872,6 +14392,10 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -10882,6 +14406,10 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -10892,6 +14420,10 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; GFX942-TGSPLIT-LABEL: local_system_one_as_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -10902,6 +14434,10 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; GFX11-WGP-LABEL: local_system_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -10912,6 +14448,10 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; GFX11-CU-LABEL: local_system_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -10922,6 +14462,10 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; GFX12-WGP-LABEL: local_system_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -10932,6 +14476,10 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; GFX12-CU-LABEL: local_system_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -10943,6 +14491,10 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -10958,12 +14510,14 @@ entry: define amdgpu_kernel void @local_system_one_as_release_store( ; GFX6-LABEL: local_system_one_as_release_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -10971,6 +14525,10 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX7-LABEL: local_system_one_as_release_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10982,6 +14540,10 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX10-WGP-LABEL: local_system_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -10992,6 +14554,10 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX10-CU-LABEL: local_system_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -11002,6 +14568,10 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11013,6 +14583,10 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -11023,6 +14597,10 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -11033,6 +14611,10 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -11043,6 +14625,10 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX942-TGSPLIT-LABEL: local_system_one_as_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -11053,6 +14639,10 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX11-WGP-LABEL: local_system_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -11063,6 +14653,10 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX11-CU-LABEL: local_system_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -11073,6 +14667,10 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX12-WGP-LABEL: local_system_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -11083,6 +14681,10 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX12-CU-LABEL: local_system_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -11094,6 +14696,10 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -11109,12 +14715,14 @@ entry: define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX6-LABEL: local_system_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -11122,6 +14730,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX7-LABEL: local_system_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11133,6 +14745,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -11143,6 +14759,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX10-CU-LABEL: local_system_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -11153,6 +14773,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11164,6 +14788,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -11174,6 +14802,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -11184,6 +14816,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -11194,6 +14830,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -11204,6 +14844,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -11214,6 +14858,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX11-CU-LABEL: local_system_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -11224,6 +14872,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -11234,6 +14886,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX12-CU-LABEL: local_system_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -11245,6 +14901,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -11261,133 +14921,183 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( ; GFX6-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -11395,10 +15105,14 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( ; GFX1250-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -11412,133 +15126,183 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX6-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -11546,10 +15310,14 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX1250-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -11563,133 +15331,183 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX6-LABEL: local_system_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_system_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -11697,10 +15515,14 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX1250-LABEL: local_system_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -11714,133 +15536,183 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -11848,10 +15720,14 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX1250-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -11865,133 +15741,183 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -11999,10 +15925,14 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX1250-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -12016,11 +15946,13 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -12032,6 +15964,10 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12047,6 +15983,10 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -12060,6 +16000,10 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX10-CU-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -12073,6 +16017,10 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12088,6 +16036,10 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -12101,6 +16053,10 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -12114,6 +16070,10 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -12127,6 +16087,10 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -12140,6 +16104,10 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12153,6 +16121,10 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12166,6 +16138,10 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12179,6 +16155,10 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12193,6 +16173,10 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -12213,11 +16197,13 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -12229,6 +16215,10 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12244,6 +16234,10 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -12257,6 +16251,10 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -12270,6 +16268,10 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12285,6 +16287,10 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -12298,6 +16304,10 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -12311,6 +16321,10 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -12324,6 +16338,10 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -12337,6 +16355,10 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12350,6 +16372,10 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12363,6 +16389,10 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12376,6 +16406,10 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12390,6 +16424,10 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -12410,11 +16448,13 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -12426,6 +16466,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12441,6 +16485,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -12454,6 +16502,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -12467,6 +16519,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12482,6 +16538,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -12495,6 +16555,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -12508,6 +16572,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -12521,6 +16589,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -12534,6 +16606,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12547,6 +16623,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12560,6 +16640,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12573,6 +16657,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12587,6 +16675,10 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -12607,12 +16699,16 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12620,12 +16716,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12633,11 +16735,17 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12645,11 +16753,17 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12657,12 +16771,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12670,11 +16790,17 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12682,11 +16808,17 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12694,11 +16826,17 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12706,11 +16844,17 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12718,48 +16862,72 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -12767,12 +16935,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -12787,12 +16961,16 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12800,12 +16978,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12813,11 +16997,17 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12825,11 +17015,17 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12837,12 +17033,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12850,11 +17052,17 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12862,11 +17070,17 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12874,11 +17088,17 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12886,11 +17106,17 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12898,48 +17124,72 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -12947,12 +17197,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -12967,12 +17223,16 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12980,12 +17240,18 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12993,11 +17259,17 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13005,11 +17277,17 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13017,12 +17295,18 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13030,11 +17314,17 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13042,11 +17332,17 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13054,11 +17350,17 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13066,11 +17368,17 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13078,48 +17386,72 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13127,12 +17459,18 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX1250-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13147,12 +17485,16 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13160,12 +17502,18 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13173,11 +17521,17 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13185,11 +17539,17 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13197,12 +17557,18 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13210,11 +17576,17 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13222,11 +17594,17 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13234,11 +17612,17 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13246,11 +17630,17 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13258,48 +17648,72 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13307,12 +17721,18 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13327,12 +17747,16 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13340,12 +17764,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13353,11 +17783,17 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13365,11 +17801,17 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13377,12 +17819,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13390,11 +17838,17 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13402,11 +17856,17 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13414,11 +17874,17 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13426,11 +17892,17 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13438,48 +17910,72 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13487,12 +17983,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13507,12 +18009,16 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13520,12 +18026,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13533,11 +18045,17 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13545,11 +18063,17 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13557,12 +18081,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13570,11 +18100,17 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13582,11 +18118,17 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13594,11 +18136,17 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13606,11 +18154,17 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13618,48 +18172,72 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13667,12 +18245,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13687,12 +18271,16 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13700,12 +18288,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13713,11 +18307,17 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13725,11 +18325,17 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13737,12 +18343,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13750,11 +18362,17 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13762,11 +18380,17 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13774,11 +18398,17 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13786,11 +18416,17 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13798,48 +18434,72 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13847,12 +18507,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX1250-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13867,12 +18533,16 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13880,12 +18550,18 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13893,11 +18569,17 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13905,11 +18587,17 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13917,12 +18605,18 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13930,11 +18624,17 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13942,11 +18642,17 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13954,11 +18660,17 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13966,11 +18678,17 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13978,48 +18696,72 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14027,12 +18769,18 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX1250-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14047,12 +18795,16 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14060,12 +18812,18 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14073,11 +18831,17 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14085,11 +18849,17 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14097,12 +18867,18 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14110,11 +18886,17 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14122,11 +18904,17 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14134,11 +18922,17 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14146,11 +18940,17 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14158,48 +18958,72 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14207,12 +19031,18 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14227,12 +19057,16 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14240,12 +19074,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14253,11 +19093,17 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14265,11 +19111,17 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14277,12 +19129,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14290,11 +19148,17 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14302,11 +19166,17 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14314,11 +19184,17 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14326,11 +19202,17 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14338,48 +19220,72 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14387,12 +19293,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14407,12 +19319,16 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14420,12 +19336,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14433,11 +19355,17 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14445,11 +19373,17 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14457,12 +19391,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14470,11 +19410,17 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14482,11 +19428,17 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14494,11 +19446,17 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14506,11 +19464,17 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14518,48 +19482,72 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14567,12 +19555,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14587,12 +19581,16 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14600,12 +19598,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14613,11 +19617,17 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14625,11 +19635,17 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14637,12 +19653,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14650,11 +19672,17 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14662,11 +19690,17 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14674,11 +19708,17 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14686,11 +19726,17 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14698,48 +19744,72 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14747,12 +19817,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14767,12 +19843,16 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14780,12 +19860,18 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14793,11 +19879,17 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14805,11 +19897,17 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14817,12 +19915,18 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14830,11 +19934,17 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14842,11 +19952,17 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14854,11 +19970,17 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14866,11 +19988,17 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14878,48 +20006,72 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14927,12 +20079,18 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX1250-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14947,12 +20105,16 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14960,12 +20122,18 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14973,11 +20141,17 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14985,11 +20159,17 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14997,12 +20177,18 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15010,11 +20196,17 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15022,11 +20214,17 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15034,11 +20232,17 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15046,11 +20250,17 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15058,48 +20268,72 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -15107,12 +20341,18 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -15127,12 +20367,16 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15140,12 +20384,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15153,11 +20403,17 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15165,11 +20421,17 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15177,12 +20439,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15190,11 +20458,17 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15202,11 +20476,17 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15214,11 +20494,17 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15226,11 +20512,17 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15238,48 +20530,72 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -15287,12 +20603,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -15307,12 +20629,16 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -15325,6 +20651,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -15342,6 +20674,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15357,6 +20695,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15372,6 +20716,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -15389,6 +20739,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15404,6 +20760,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15419,6 +20781,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15434,6 +20802,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15449,6 +20823,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15464,6 +20844,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15479,6 +20865,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15494,6 +20886,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15510,6 +20908,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15534,12 +20938,16 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -15552,6 +20960,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -15569,6 +20983,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15584,6 +21004,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15599,6 +21025,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -15616,6 +21048,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15631,6 +21069,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15646,6 +21090,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15661,6 +21111,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15676,6 +21132,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15691,6 +21153,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15706,6 +21174,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15721,6 +21195,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15737,6 +21217,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15761,12 +21247,16 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -15779,6 +21269,12 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -15796,6 +21292,12 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15811,6 +21313,12 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15826,6 +21334,12 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -15843,6 +21357,12 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15858,6 +21378,12 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15873,6 +21399,12 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15888,6 +21420,12 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15903,6 +21441,12 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15918,6 +21462,12 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15933,6 +21483,12 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15948,6 +21504,12 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15964,6 +21526,12 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15988,12 +21556,16 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16006,6 +21578,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16023,6 +21601,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16038,6 +21622,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16053,6 +21643,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16070,6 +21666,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16085,6 +21687,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16100,6 +21708,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16115,6 +21729,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16130,6 +21750,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16145,6 +21771,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16160,6 +21792,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16175,6 +21813,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16191,6 +21835,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16215,12 +21865,16 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16233,6 +21887,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16250,6 +21910,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16265,6 +21931,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16280,6 +21952,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16297,6 +21975,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16312,6 +21996,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16327,6 +22017,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16342,6 +22038,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16357,6 +22059,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16372,6 +22080,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16387,6 +22101,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16402,6 +22122,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16418,6 +22144,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16442,12 +22174,16 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16460,6 +22196,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16477,6 +22219,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16492,6 +22240,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16507,6 +22261,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16524,6 +22284,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16539,6 +22305,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16554,6 +22326,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16569,6 +22347,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16584,6 +22368,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16599,6 +22389,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16614,6 +22410,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16629,6 +22431,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16645,6 +22453,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16669,12 +22483,16 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16687,6 +22505,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16704,6 +22528,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16719,6 +22549,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16734,6 +22570,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16751,6 +22593,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16766,6 +22614,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16781,6 +22635,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16796,6 +22656,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16811,6 +22677,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16826,6 +22698,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16841,6 +22719,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16856,6 +22740,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16872,6 +22762,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16896,12 +22792,16 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16914,6 +22814,12 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16931,6 +22837,12 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16946,6 +22858,12 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16961,6 +22879,12 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16978,6 +22902,12 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16993,6 +22923,12 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17008,6 +22944,12 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17023,6 +22965,12 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17038,6 +22986,12 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17053,6 +23007,12 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17068,6 +23028,12 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17083,6 +23049,12 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17099,6 +23071,12 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -17123,12 +23101,16 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -17141,6 +23123,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -17158,6 +23146,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17173,6 +23167,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17188,6 +23188,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -17205,6 +23211,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17220,6 +23232,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17235,6 +23253,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17250,6 +23274,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17265,6 +23295,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17280,6 +23316,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17295,6 +23337,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17310,6 +23358,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17326,6 +23380,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -17350,12 +23410,16 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -17368,6 +23432,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -17385,6 +23455,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17400,6 +23476,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17415,6 +23497,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -17432,6 +23520,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17447,6 +23541,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17462,6 +23562,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17477,6 +23583,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17492,6 +23604,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17507,6 +23625,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17522,6 +23646,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17537,6 +23667,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17553,6 +23689,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -17577,12 +23719,16 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -17595,6 +23741,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -17612,6 +23764,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17627,6 +23785,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17642,6 +23806,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -17659,6 +23829,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17674,6 +23850,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17689,6 +23871,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17704,6 +23892,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17719,6 +23913,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17734,6 +23934,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17749,6 +23955,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17764,6 +23976,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17780,6 +23998,12 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -17804,12 +24028,16 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -17822,6 +24050,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -17839,6 +24073,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17854,6 +24094,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17869,6 +24115,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -17886,6 +24138,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17901,6 +24159,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17916,6 +24180,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17931,6 +24201,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17946,6 +24222,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17961,6 +24243,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17976,6 +24264,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17991,6 +24285,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18007,6 +24307,12 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -18031,12 +24337,16 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -18049,6 +24359,12 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -18066,6 +24382,12 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18081,6 +24403,12 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18096,6 +24424,12 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -18113,6 +24447,12 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18128,6 +24468,12 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18143,6 +24489,12 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18158,6 +24510,12 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18173,6 +24531,12 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18188,6 +24552,12 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18203,6 +24573,12 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18218,6 +24594,12 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18234,6 +24616,12 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -18258,12 +24646,16 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -18276,6 +24668,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -18293,6 +24691,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18308,6 +24712,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18323,6 +24733,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -18340,6 +24756,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18355,6 +24777,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18370,6 +24798,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18385,6 +24819,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18400,6 +24840,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18415,6 +24861,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18430,6 +24882,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18445,6 +24903,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18461,6 +24925,12 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -18485,12 +24955,16 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -18503,6 +24977,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -18520,6 +25000,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18535,6 +25021,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18550,6 +25042,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -18567,6 +25065,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18582,6 +25086,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18597,6 +25107,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18612,6 +25128,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18627,6 +25149,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18642,6 +25170,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18657,6 +25191,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18672,6 +25212,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18688,6 +25234,12 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index 7e61338dce83..f2caa062df32 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -15,6 +15,7 @@ define amdgpu_kernel void @local_volatile_load_0( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: ; kill: def $sgpr2 killed $sgpr4 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -38,24 +39,30 @@ define amdgpu_kernel void @local_volatile_load_0( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_volatile_load_0: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: ds_read_b32 v1, v1 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -64,10 +71,13 @@ define amdgpu_kernel void @local_volatile_load_0( ; ; GFX10-CU-LABEL: local_volatile_load_0: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: ds_read_b32 v1, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -77,6 +87,9 @@ define amdgpu_kernel void @local_volatile_load_0( ; SKIP-CACHE-INV-LABEL: local_volatile_load_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -97,10 +110,13 @@ define amdgpu_kernel void @local_volatile_load_0( ; ; GFX11-WGP-LABEL: local_volatile_load_0: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: ds_load_b32 v1, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -109,10 +125,13 @@ define amdgpu_kernel void @local_volatile_load_0( ; ; GFX11-CU-LABEL: local_volatile_load_0: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: ds_load_b32 v1, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -121,38 +140,50 @@ define amdgpu_kernel void @local_volatile_load_0( ; ; GFX12-WGP-LABEL: local_volatile_load_0: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: ds_load_b32 v1, v1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_volatile_load_0: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: ds_load_b32 v1, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: local_volatile_load_0: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ds_load_b32 v1, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { @@ -167,6 +198,7 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: ; kill: def $sgpr2 killed $sgpr4 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -192,27 +224,33 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v0, s7, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], s6, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_volatile_load_1: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-WGP-NEXT: ds_read_b32 v1, v1 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -222,10 +260,13 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX10-CU-LABEL: local_volatile_load_1: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-CU-NEXT: ds_read_b32 v1, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -235,6 +276,9 @@ define amdgpu_kernel void @local_volatile_load_1( ; SKIP-CACHE-INV-LABEL: local_volatile_load_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -258,12 +302,15 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX11-WGP-LABEL: local_volatile_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff ; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-WGP-NEXT: ds_load_b32 v1, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -273,12 +320,15 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX11-CU-LABEL: local_volatile_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff ; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-CU-NEXT: ds_load_b32 v1, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -288,30 +338,38 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX12-WGP-LABEL: local_volatile_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX12-WGP-NEXT: ds_load_b32 v1, v1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_volatile_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX12-CU-NEXT: ds_load_b32 v1, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; @@ -319,15 +377,19 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_mov_b32 s3, 0x3ff ; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX1250-NEXT: ds_load_b32 v1, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { @@ -342,6 +404,8 @@ entry: define amdgpu_kernel void @local_volatile_store_0( ; GFX6-LABEL: local_volatile_store_0: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s1, s[4:5], 0xb ; GFX6-NEXT: ; kill: def $sgpr0 killed $sgpr1 ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 @@ -356,6 +420,9 @@ define amdgpu_kernel void @local_volatile_store_0( ; ; GFX7-LABEL: local_volatile_store_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -369,6 +436,9 @@ define amdgpu_kernel void @local_volatile_store_0( ; ; GFX10-WGP-LABEL: local_volatile_store_0: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -381,6 +451,9 @@ define amdgpu_kernel void @local_volatile_store_0( ; ; GFX10-CU-LABEL: local_volatile_store_0: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -393,6 +466,9 @@ define amdgpu_kernel void @local_volatile_store_0( ; ; SKIP-CACHE-INV-LABEL: local_volatile_store_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -406,6 +482,9 @@ define amdgpu_kernel void @local_volatile_store_0( ; ; GFX11-WGP-LABEL: local_volatile_store_0: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -418,6 +497,9 @@ define amdgpu_kernel void @local_volatile_store_0( ; ; GFX11-CU-LABEL: local_volatile_store_0: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -430,6 +512,9 @@ define amdgpu_kernel void @local_volatile_store_0( ; ; GFX12-WGP-LABEL: local_volatile_store_0: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -442,6 +527,9 @@ define amdgpu_kernel void @local_volatile_store_0( ; ; GFX12-CU-LABEL: local_volatile_store_0: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -455,6 +543,9 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX1250-LABEL: local_volatile_store_0: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -474,6 +565,8 @@ entry: define amdgpu_kernel void @local_volatile_store_1( ; GFX6-LABEL: local_volatile_store_1: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s1, s[4:5], 0xb ; GFX6-NEXT: ; kill: def $sgpr0 killed $sgpr1 ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 @@ -490,6 +583,9 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX7-LABEL: local_volatile_store_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -505,6 +601,9 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX10-WGP-LABEL: local_volatile_store_1: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -517,6 +616,9 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX10-CU-LABEL: local_volatile_store_1: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -529,6 +631,9 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; SKIP-CACHE-INV-LABEL: local_volatile_store_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -544,6 +649,9 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX11-WGP-LABEL: local_volatile_store_1: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -558,6 +666,9 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX11-CU-LABEL: local_volatile_store_1: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -572,6 +683,9 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX12-WGP-LABEL: local_volatile_store_1: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -586,6 +700,9 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX12-CU-LABEL: local_volatile_store_1: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -601,6 +718,9 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX1250-LABEL: local_volatile_store_1: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -641,10 +761,13 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; ; GFX7-LABEL: local_volatile_workgroup_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -655,9 +778,12 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; ; GFX10-WGP-LABEL: local_volatile_workgroup_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -668,9 +794,12 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; ; GFX10-CU-LABEL: local_volatile_workgroup_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -680,10 +809,13 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; ; SKIP-CACHE-INV-LABEL: local_volatile_workgroup_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -694,9 +826,12 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; ; GFX11-WGP-LABEL: local_volatile_workgroup_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -707,9 +842,12 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; ; GFX11-CU-LABEL: local_volatile_workgroup_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -719,25 +857,33 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; ; GFX12-WGP-LABEL: local_volatile_workgroup_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_volatile_workgroup_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -745,12 +891,16 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; GFX1250-LABEL: local_volatile_workgroup_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 ; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: ds_store_b32 v0, v1 ; GFX1250-NEXT: s_endpgm @@ -764,12 +914,14 @@ entry: define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX6-LABEL: local_volatile_workgroup_release_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x9 ; GFX6-NEXT: s_load_dword s1, s[4:5], 0xa ; GFX6-NEXT: ; kill: def $sgpr0 killed $sgpr1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s0, s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -778,6 +930,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX7-LABEL: local_volatile_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -790,6 +946,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX10-WGP-LABEL: local_volatile_workgroup_release_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -802,6 +962,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX10-CU-LABEL: local_volatile_workgroup_release_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -814,6 +978,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; SKIP-CACHE-INV-LABEL: local_volatile_workgroup_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -826,6 +994,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX11-WGP-LABEL: local_volatile_workgroup_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -838,6 +1010,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX11-CU-LABEL: local_volatile_workgroup_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -850,6 +1026,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX12-WGP-LABEL: local_volatile_workgroup_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -864,6 +1044,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX12-CU-LABEL: local_volatile_workgroup_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -879,6 +1063,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll index adc080a91804..82542e26711e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll @@ -33,135 +33,169 @@ define amdgpu_kernel void @local_wavefront_unordered_load( ; ; GFX7-LABEL: local_wavefront_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -169,11 +203,15 @@ define amdgpu_kernel void @local_wavefront_unordered_load( ; ; GFX12-CU-LABEL: local_wavefront_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -182,11 +220,15 @@ define amdgpu_kernel void @local_wavefront_unordered_load( ; GFX1250-LABEL: local_wavefront_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -217,135 +259,169 @@ define amdgpu_kernel void @local_wavefront_monotonic_load( ; ; GFX7-LABEL: local_wavefront_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -353,11 +429,15 @@ define amdgpu_kernel void @local_wavefront_monotonic_load( ; ; GFX12-CU-LABEL: local_wavefront_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -366,11 +446,15 @@ define amdgpu_kernel void @local_wavefront_monotonic_load( ; GFX1250-LABEL: local_wavefront_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -401,135 +485,169 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; ; GFX7-LABEL: local_wavefront_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -537,11 +655,15 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; ; GFX12-CU-LABEL: local_wavefront_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -550,11 +672,15 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX1250-LABEL: local_wavefront_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -585,135 +711,169 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; ; GFX7-LABEL: local_wavefront_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -721,11 +881,15 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; ; GFX12-CU-LABEL: local_wavefront_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -734,11 +898,15 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX1250-LABEL: local_wavefront_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -753,12 +921,14 @@ entry: define amdgpu_kernel void @local_wavefront_unordered_store( ; GFX6-LABEL: local_wavefront_unordered_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -766,6 +936,10 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; GFX7-LABEL: local_wavefront_unordered_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -777,6 +951,10 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; GFX10-WGP-LABEL: local_wavefront_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -787,6 +965,10 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; GFX10-CU-LABEL: local_wavefront_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -797,6 +979,10 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; SKIP-CACHE-INV-LABEL: local_wavefront_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -808,6 +994,10 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -818,6 +1008,10 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; GFX90A-TGSPLIT-LABEL: local_wavefront_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -828,6 +1022,10 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -838,6 +1036,10 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; GFX942-TGSPLIT-LABEL: local_wavefront_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -848,6 +1050,10 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; GFX11-WGP-LABEL: local_wavefront_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -858,6 +1064,10 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; GFX11-CU-LABEL: local_wavefront_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -868,6 +1078,10 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; GFX12-WGP-LABEL: local_wavefront_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -878,6 +1092,10 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; GFX12-CU-LABEL: local_wavefront_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -889,6 +1107,10 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -904,12 +1126,14 @@ entry: define amdgpu_kernel void @local_wavefront_monotonic_store( ; GFX6-LABEL: local_wavefront_monotonic_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -917,6 +1141,10 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; GFX7-LABEL: local_wavefront_monotonic_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -928,6 +1156,10 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; GFX10-WGP-LABEL: local_wavefront_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -938,6 +1170,10 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; GFX10-CU-LABEL: local_wavefront_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -948,6 +1184,10 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -959,6 +1199,10 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -969,6 +1213,10 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -979,6 +1227,10 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -989,6 +1241,10 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; GFX942-TGSPLIT-LABEL: local_wavefront_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -999,6 +1255,10 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; GFX11-WGP-LABEL: local_wavefront_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1009,6 +1269,10 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; GFX11-CU-LABEL: local_wavefront_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1019,6 +1283,10 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; GFX12-WGP-LABEL: local_wavefront_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1029,6 +1297,10 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; GFX12-CU-LABEL: local_wavefront_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1040,6 +1312,10 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -1055,12 +1331,14 @@ entry: define amdgpu_kernel void @local_wavefront_release_store( ; GFX6-LABEL: local_wavefront_release_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -1068,6 +1346,10 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX7-LABEL: local_wavefront_release_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1079,6 +1361,10 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX10-WGP-LABEL: local_wavefront_release_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -1089,6 +1375,10 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX10-CU-LABEL: local_wavefront_release_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -1099,6 +1389,10 @@ define amdgpu_kernel void @local_wavefront_release_store( ; SKIP-CACHE-INV-LABEL: local_wavefront_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1110,6 +1404,10 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1120,6 +1418,10 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1130,6 +1432,10 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1140,6 +1446,10 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX942-TGSPLIT-LABEL: local_wavefront_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1150,6 +1460,10 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX11-WGP-LABEL: local_wavefront_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1160,6 +1474,10 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX11-CU-LABEL: local_wavefront_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1170,6 +1488,10 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX12-WGP-LABEL: local_wavefront_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1180,6 +1502,10 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX12-CU-LABEL: local_wavefront_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1191,6 +1517,10 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -1206,12 +1536,14 @@ entry: define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX6-LABEL: local_wavefront_seq_cst_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -1219,6 +1551,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX7-LABEL: local_wavefront_seq_cst_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1230,6 +1566,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX10-WGP-LABEL: local_wavefront_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -1240,6 +1580,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX10-CU-LABEL: local_wavefront_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -1250,6 +1594,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1261,6 +1609,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1271,6 +1623,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1281,6 +1637,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1291,6 +1651,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX942-TGSPLIT-LABEL: local_wavefront_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1301,6 +1665,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX11-WGP-LABEL: local_wavefront_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1311,6 +1679,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX11-CU-LABEL: local_wavefront_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1321,6 +1693,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX12-WGP-LABEL: local_wavefront_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1331,6 +1707,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX12-CU-LABEL: local_wavefront_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1342,6 +1722,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -1358,133 +1742,183 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw( ; GFX6-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -1492,10 +1926,14 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw( ; GFX1250-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -1509,133 +1947,183 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX6-LABEL: local_wavefront_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -1643,10 +2131,14 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX1250-LABEL: local_wavefront_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -1660,133 +2152,183 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX6-LABEL: local_wavefront_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -1794,10 +2336,14 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX1250-LABEL: local_wavefront_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -1811,133 +2357,183 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX6-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -1945,10 +2541,14 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX1250-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -1962,133 +2562,183 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX6-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -2096,10 +2746,14 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX1250-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -2113,11 +2767,13 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX6-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -2129,6 +2785,10 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX7-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2144,6 +2804,10 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX10-WGP-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2157,6 +2821,10 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX10-CU-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2170,6 +2838,10 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2185,6 +2857,10 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2198,6 +2874,10 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2211,6 +2891,10 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2224,6 +2908,10 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2237,6 +2925,10 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2250,6 +2942,10 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2263,6 +2959,10 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2276,6 +2976,10 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2290,6 +2994,10 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -2310,11 +3018,13 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -2326,6 +3036,10 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX7-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2341,6 +3055,10 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX10-WGP-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2354,6 +3072,10 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX10-CU-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2367,6 +3089,10 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2382,6 +3108,10 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2395,6 +3125,10 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2408,6 +3142,10 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2421,6 +3159,10 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2434,6 +3176,10 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2447,6 +3193,10 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2460,6 +3210,10 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2473,6 +3227,10 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2487,6 +3245,10 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -2507,11 +3269,13 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -2523,6 +3287,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX7-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2538,6 +3306,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX10-WGP-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2551,6 +3323,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX10-CU-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2564,6 +3340,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2579,6 +3359,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2592,6 +3376,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2605,6 +3393,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2618,6 +3410,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2631,6 +3427,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2644,6 +3444,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2657,6 +3461,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2670,6 +3478,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2684,6 +3496,10 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -2704,12 +3520,16 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2717,12 +3537,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2730,11 +3556,17 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2742,11 +3574,17 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2754,12 +3592,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2767,11 +3611,17 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2779,11 +3629,17 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2791,11 +3647,17 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2803,11 +3665,17 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2815,48 +3683,72 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -2864,12 +3756,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -2884,12 +3782,16 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX6-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2897,12 +3799,18 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2910,11 +3818,17 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2922,11 +3836,17 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2934,12 +3854,18 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2947,11 +3873,17 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2959,11 +3891,17 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2971,11 +3909,17 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2983,11 +3927,17 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2995,48 +3945,72 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -3044,12 +4018,18 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -3064,12 +4044,16 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX6-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3077,12 +4061,18 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; ; GFX7-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3090,11 +4080,17 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3102,11 +4098,17 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3114,12 +4116,18 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3127,11 +4135,17 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3139,11 +4153,17 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3151,11 +4171,17 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3163,11 +4189,17 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3175,48 +4207,72 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -3224,12 +4280,18 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX1250-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -3244,12 +4306,16 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3257,12 +4323,18 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3270,11 +4342,17 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3282,11 +4360,17 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3294,12 +4378,18 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3307,11 +4397,17 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3319,11 +4415,17 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3331,11 +4433,17 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3343,11 +4451,17 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3355,48 +4469,72 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -3404,12 +4542,18 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -3424,12 +4568,16 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3437,12 +4585,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3450,11 +4604,17 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3462,11 +4622,17 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3474,12 +4640,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3487,11 +4659,17 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3499,11 +4677,17 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3511,11 +4695,17 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3523,11 +4713,17 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3535,48 +4731,72 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -3584,12 +4804,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -3604,12 +4830,16 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX6-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3617,12 +4847,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3630,11 +4866,17 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3642,11 +4884,17 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3654,12 +4902,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3667,11 +4921,17 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3679,11 +4939,17 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3691,11 +4957,17 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3703,11 +4975,17 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3715,48 +4993,72 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -3764,12 +5066,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -3784,12 +5092,16 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3797,12 +5109,18 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3810,11 +5128,17 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3822,11 +5146,17 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3834,12 +5164,18 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3847,11 +5183,17 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3859,11 +5201,17 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3871,11 +5219,17 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3883,11 +5237,17 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3895,48 +5255,72 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -3944,12 +5328,18 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX1250-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -3964,12 +5354,16 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX6-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3977,12 +5371,18 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; ; GFX7-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3990,11 +5390,17 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4002,11 +5408,17 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4014,12 +5426,18 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4027,11 +5445,17 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4039,11 +5463,17 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4051,11 +5481,17 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4063,11 +5499,17 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4075,48 +5517,72 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -4124,12 +5590,18 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX1250-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -4144,12 +5616,16 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4157,12 +5633,18 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4170,11 +5652,17 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4182,11 +5670,17 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4194,12 +5688,18 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4207,11 +5707,17 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4219,11 +5725,17 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4231,11 +5743,17 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4243,11 +5761,17 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4255,48 +5779,72 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -4304,12 +5852,18 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -4324,12 +5878,16 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4337,12 +5895,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4350,11 +5914,17 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4362,11 +5932,17 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4374,12 +5950,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4387,11 +5969,17 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4399,11 +5987,17 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4411,11 +6005,17 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4423,11 +6023,17 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4435,48 +6041,72 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -4484,12 +6114,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -4504,12 +6140,16 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4517,12 +6157,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4530,11 +6176,17 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4542,11 +6194,17 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4554,12 +6212,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4567,11 +6231,17 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4579,11 +6249,17 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4591,11 +6267,17 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4603,11 +6285,17 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4615,48 +6303,72 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -4664,12 +6376,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -4684,12 +6402,16 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4697,12 +6419,18 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4710,11 +6438,17 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4722,11 +6456,17 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4734,12 +6474,18 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4747,11 +6493,17 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4759,11 +6511,17 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4771,11 +6529,17 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4783,11 +6547,17 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4795,48 +6565,72 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -4844,12 +6638,18 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -4864,12 +6664,16 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX6-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4877,12 +6681,18 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4890,11 +6700,17 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4902,11 +6718,17 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4914,12 +6736,18 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4927,11 +6755,17 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4939,11 +6773,17 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4951,11 +6791,17 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4963,11 +6809,17 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4975,48 +6827,72 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -5024,12 +6900,18 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX1250-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -5044,12 +6926,16 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5057,12 +6943,18 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5070,11 +6962,17 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5082,11 +6980,17 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5094,12 +6998,18 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5107,11 +7017,17 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5119,11 +7035,17 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5131,11 +7053,17 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5143,11 +7071,17 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5155,48 +7089,72 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -5204,12 +7162,18 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -5224,12 +7188,16 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5237,12 +7205,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5250,11 +7224,17 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5262,11 +7242,17 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5274,12 +7260,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5287,11 +7279,17 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5299,11 +7297,17 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5311,11 +7315,17 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5323,11 +7333,17 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -5335,48 +7351,72 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -5384,12 +7424,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -5404,12 +7450,16 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5422,6 +7472,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5439,6 +7495,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5454,6 +7516,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5469,6 +7537,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -5486,6 +7560,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5501,6 +7581,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5516,6 +7602,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5531,6 +7623,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5546,6 +7644,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5561,6 +7665,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5576,6 +7686,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5591,6 +7707,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5607,6 +7729,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -5631,12 +7759,16 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5649,6 +7781,12 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5666,6 +7804,12 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5681,6 +7825,12 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5696,6 +7846,12 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -5713,6 +7869,12 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5728,6 +7890,12 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5743,6 +7911,12 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5758,6 +7932,12 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5773,6 +7953,12 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5788,6 +7974,12 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5803,6 +7995,12 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5818,6 +8016,12 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5834,6 +8038,12 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -5858,12 +8068,16 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5876,6 +8090,12 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5893,6 +8113,12 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5908,6 +8134,12 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5923,6 +8155,12 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -5940,6 +8178,12 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5955,6 +8199,12 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5970,6 +8220,12 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5985,6 +8241,12 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6000,6 +8262,12 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6015,6 +8283,12 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6030,6 +8304,12 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6045,6 +8325,12 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6061,6 +8347,12 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6085,12 +8377,16 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6103,6 +8399,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -6120,6 +8422,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6135,6 +8443,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6150,6 +8464,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -6167,6 +8487,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6182,6 +8508,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6197,6 +8529,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6212,6 +8550,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6227,6 +8571,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6242,6 +8592,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6257,6 +8613,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6272,6 +8634,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6288,6 +8656,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6312,12 +8686,16 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6330,6 +8708,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -6347,6 +8731,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6362,6 +8752,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6377,6 +8773,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -6394,6 +8796,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6409,6 +8817,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6424,6 +8838,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6439,6 +8859,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6454,6 +8880,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6469,6 +8901,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6484,6 +8922,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6499,6 +8943,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6515,6 +8965,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6539,12 +8995,16 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6557,6 +9017,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -6574,6 +9040,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6589,6 +9061,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6604,6 +9082,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -6621,6 +9105,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6636,6 +9126,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6651,6 +9147,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6666,6 +9168,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6681,6 +9189,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6696,6 +9210,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6711,6 +9231,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6726,6 +9252,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6742,6 +9274,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6766,12 +9304,16 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6784,6 +9326,12 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -6801,6 +9349,12 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6816,6 +9370,12 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6831,6 +9391,12 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -6848,6 +9414,12 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6863,6 +9435,12 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6878,6 +9456,12 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6893,6 +9477,12 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6908,6 +9498,12 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6923,6 +9519,12 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6938,6 +9540,12 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6953,6 +9561,12 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6969,6 +9583,12 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6993,12 +9613,16 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -7011,6 +9635,12 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7028,6 +9658,12 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7043,6 +9679,12 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7058,6 +9700,12 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7075,6 +9723,12 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7090,6 +9744,12 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7105,6 +9765,12 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7120,6 +9786,12 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7135,6 +9807,12 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7150,6 +9828,12 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7165,6 +9849,12 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7180,6 +9870,12 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7196,6 +9892,12 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7220,12 +9922,16 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -7238,6 +9944,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7255,6 +9967,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7270,6 +9988,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7285,6 +10009,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7302,6 +10032,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7317,6 +10053,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7332,6 +10074,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7347,6 +10095,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7362,6 +10116,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7377,6 +10137,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7392,6 +10158,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7407,6 +10179,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7423,6 +10201,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7447,12 +10231,16 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -7465,6 +10253,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7482,6 +10276,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7497,6 +10297,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7512,6 +10318,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7529,6 +10341,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7544,6 +10362,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7559,6 +10383,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7574,6 +10404,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7589,6 +10425,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7604,6 +10446,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7619,6 +10467,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7634,6 +10488,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7650,6 +10510,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7674,12 +10540,16 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -7692,6 +10562,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7709,6 +10585,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7724,6 +10606,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7739,6 +10627,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7756,6 +10650,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7771,6 +10671,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7786,6 +10692,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7801,6 +10713,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7816,6 +10734,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7831,6 +10755,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7846,6 +10776,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7861,6 +10797,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7877,6 +10819,12 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7901,12 +10849,16 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -7919,6 +10871,12 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7936,6 +10894,12 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7951,6 +10915,12 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7966,6 +10936,12 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7983,6 +10959,12 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7998,6 +10980,12 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8013,6 +11001,12 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8028,6 +11022,12 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8043,6 +11043,12 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8058,6 +11064,12 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8073,6 +11085,12 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8088,6 +11106,12 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8104,6 +11128,12 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -8128,12 +11158,16 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -8146,6 +11180,12 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -8163,6 +11203,12 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8178,6 +11224,12 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8193,6 +11245,12 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -8210,6 +11268,12 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8225,6 +11289,12 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8240,6 +11310,12 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8255,6 +11331,12 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8270,6 +11352,12 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8285,6 +11373,12 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8300,6 +11394,12 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8315,6 +11415,12 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8331,6 +11437,12 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -8355,12 +11467,16 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -8373,6 +11489,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -8390,6 +11512,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8405,6 +11533,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8420,6 +11554,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -8437,6 +11577,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8452,6 +11598,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8467,6 +11619,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8482,6 +11640,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8497,6 +11661,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8512,6 +11682,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8527,6 +11703,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8542,6 +11724,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8558,6 +11746,12 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -8582,12 +11776,16 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -8600,6 +11798,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -8617,6 +11821,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8632,6 +11842,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8647,6 +11863,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -8664,6 +11886,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8679,6 +11907,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8694,6 +11928,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8709,6 +11949,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8724,6 +11970,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8739,6 +11991,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8754,6 +12012,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8769,6 +12033,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8785,6 +12055,12 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -8824,135 +12100,169 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load( ; ; GFX7-LABEL: local_wavefront_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -8960,11 +12270,15 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load( ; ; GFX12-CU-LABEL: local_wavefront_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -8973,11 +12287,15 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load( ; GFX1250-LABEL: local_wavefront_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -9008,135 +12326,169 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( ; ; GFX7-LABEL: local_wavefront_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -9144,11 +12496,15 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( ; ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -9157,11 +12513,15 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( ; GFX1250-LABEL: local_wavefront_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -9192,135 +12552,169 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; ; GFX7-LABEL: local_wavefront_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -9328,11 +12722,15 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -9341,11 +12739,15 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX1250-LABEL: local_wavefront_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -9376,135 +12778,169 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -9512,11 +12948,15 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -9525,11 +12965,15 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -9544,12 +12988,14 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; GFX6-LABEL: local_wavefront_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -9557,6 +13003,10 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; GFX7-LABEL: local_wavefront_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9568,6 +13018,10 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; GFX10-WGP-LABEL: local_wavefront_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -9578,6 +13032,10 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; GFX10-CU-LABEL: local_wavefront_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -9588,6 +13046,10 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9599,6 +13061,10 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -9609,6 +13075,10 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -9619,6 +13089,10 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -9629,6 +13103,10 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -9639,6 +13117,10 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; GFX11-WGP-LABEL: local_wavefront_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -9649,6 +13131,10 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; GFX11-CU-LABEL: local_wavefront_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -9659,6 +13145,10 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; GFX12-WGP-LABEL: local_wavefront_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -9669,6 +13159,10 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; GFX12-CU-LABEL: local_wavefront_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -9680,6 +13174,10 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -9695,12 +13193,14 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; GFX6-LABEL: local_wavefront_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -9708,6 +13208,10 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; GFX7-LABEL: local_wavefront_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9719,6 +13223,10 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -9729,6 +13237,10 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -9739,6 +13251,10 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9750,6 +13266,10 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -9760,6 +13280,10 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -9770,6 +13294,10 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -9780,6 +13308,10 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -9790,6 +13322,10 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -9800,6 +13336,10 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -9810,6 +13350,10 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -9820,6 +13364,10 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -9831,6 +13379,10 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -9846,12 +13398,14 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX6-LABEL: local_wavefront_one_as_release_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -9859,6 +13413,10 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX7-LABEL: local_wavefront_one_as_release_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9870,6 +13428,10 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX10-WGP-LABEL: local_wavefront_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -9880,6 +13442,10 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX10-CU-LABEL: local_wavefront_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -9890,6 +13456,10 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9901,6 +13471,10 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -9911,6 +13485,10 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -9921,6 +13499,10 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -9931,6 +13513,10 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -9941,6 +13527,10 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX11-WGP-LABEL: local_wavefront_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -9951,6 +13541,10 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX11-CU-LABEL: local_wavefront_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -9961,6 +13555,10 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX12-WGP-LABEL: local_wavefront_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -9971,6 +13569,10 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX12-CU-LABEL: local_wavefront_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -9982,6 +13584,10 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -9997,12 +13603,14 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX6-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -10010,6 +13618,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX7-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10021,6 +13633,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -10031,6 +13647,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -10041,6 +13661,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10052,6 +13676,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -10062,6 +13690,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -10072,6 +13704,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -10082,6 +13718,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -10092,6 +13732,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -10102,6 +13746,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -10112,6 +13760,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -10122,6 +13774,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -10133,6 +13789,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -10149,133 +13809,183 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( ; GFX6-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -10283,10 +13993,14 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( ; GFX1250-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -10300,133 +14014,183 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX6-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -10434,10 +14198,14 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX1250-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -10451,133 +14219,183 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX6-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -10585,10 +14403,14 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX1250-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -10602,133 +14424,183 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -10736,10 +14608,14 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX1250-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -10753,133 +14629,183 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -10887,10 +14813,14 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -10904,11 +14834,13 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -10920,6 +14852,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10935,6 +14871,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -10948,6 +14888,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -10961,6 +14905,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10976,6 +14924,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -10989,6 +14941,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -11002,6 +14958,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -11015,6 +14975,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -11028,6 +14992,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11041,6 +15009,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -11054,6 +15026,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11067,6 +15043,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -11081,6 +15061,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -11101,11 +15085,13 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -11117,6 +15103,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11132,6 +15122,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -11145,6 +15139,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -11158,6 +15156,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11173,6 +15175,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -11186,6 +15192,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -11199,6 +15209,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -11212,6 +15226,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -11225,6 +15243,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11238,6 +15260,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -11251,6 +15277,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11264,6 +15294,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -11278,6 +15312,10 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -11298,11 +15336,13 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -11314,6 +15354,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11329,6 +15373,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -11342,6 +15390,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -11355,6 +15407,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11370,6 +15426,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -11383,6 +15443,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -11396,6 +15460,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -11409,6 +15477,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -11422,6 +15494,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11435,6 +15511,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -11448,6 +15528,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11461,6 +15545,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -11475,6 +15563,10 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -11495,12 +15587,16 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11508,12 +15604,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11521,11 +15623,17 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11533,11 +15641,17 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11545,12 +15659,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11558,11 +15678,17 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11570,11 +15696,17 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11582,11 +15714,17 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11594,11 +15732,17 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11606,48 +15750,72 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -11655,12 +15823,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -11675,12 +15849,16 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11688,12 +15866,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11701,11 +15885,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11713,11 +15903,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11725,12 +15921,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11738,11 +15940,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11750,11 +15958,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11762,11 +15976,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11774,11 +15994,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11786,48 +16012,72 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -11835,12 +16085,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -11855,12 +16111,16 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11868,12 +16128,18 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11881,11 +16147,17 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11893,11 +16165,17 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11905,12 +16183,18 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11918,11 +16202,17 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11930,11 +16220,17 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11942,11 +16238,17 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11954,11 +16256,17 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -11966,48 +16274,72 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -12015,12 +16347,18 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX1250-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -12035,12 +16373,16 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12048,12 +16390,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12061,11 +16409,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12073,11 +16427,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12085,12 +16445,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12098,11 +16464,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12110,11 +16482,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12122,11 +16500,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12134,11 +16518,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12146,48 +16536,72 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -12195,12 +16609,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -12215,12 +16635,16 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12228,12 +16652,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12241,11 +16671,17 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12253,11 +16689,17 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12265,12 +16707,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12278,11 +16726,17 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12290,11 +16744,17 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12302,11 +16762,17 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12314,11 +16780,17 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12326,48 +16798,72 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -12375,12 +16871,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -12395,12 +16897,16 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12408,12 +16914,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12421,11 +16933,17 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12433,11 +16951,17 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12445,12 +16969,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12458,11 +16988,17 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12470,11 +17006,17 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12482,11 +17024,17 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12494,11 +17042,17 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12506,48 +17060,72 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -12555,12 +17133,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -12575,12 +17159,16 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12588,12 +17176,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12601,11 +17195,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12613,11 +17213,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12625,12 +17231,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12638,11 +17250,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12650,11 +17268,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12662,11 +17286,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12674,11 +17304,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12686,48 +17322,72 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -12735,12 +17395,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX1250-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -12755,12 +17421,16 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12768,12 +17438,18 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12781,11 +17457,17 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12793,11 +17475,17 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12805,12 +17493,18 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12818,11 +17512,17 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12830,11 +17530,17 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12842,11 +17548,17 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12854,11 +17566,17 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12866,48 +17584,72 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -12915,12 +17657,18 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX1250-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -12935,12 +17683,16 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12948,12 +17700,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12961,11 +17719,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12973,11 +17737,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12985,12 +17755,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12998,11 +17774,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13010,11 +17792,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13022,11 +17810,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13034,11 +17828,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13046,48 +17846,72 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13095,12 +17919,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13115,12 +17945,16 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13128,12 +17962,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13141,11 +17981,17 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13153,11 +17999,17 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13165,12 +18017,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13178,11 +18036,17 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13190,11 +18054,17 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13202,11 +18072,17 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13214,11 +18090,17 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13226,48 +18108,72 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13275,12 +18181,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13295,12 +18207,16 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13308,12 +18224,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13321,11 +18243,17 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13333,11 +18261,17 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13345,12 +18279,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13358,11 +18298,17 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13370,11 +18316,17 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13382,11 +18334,17 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13394,11 +18352,17 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13406,48 +18370,72 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13455,12 +18443,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13475,12 +18469,16 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13488,12 +18486,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13501,11 +18505,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13513,11 +18523,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13525,12 +18541,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13538,11 +18560,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13550,11 +18578,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13562,11 +18596,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13574,11 +18614,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13586,48 +18632,72 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13635,12 +18705,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13655,12 +18731,16 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13668,12 +18748,18 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13681,11 +18767,17 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13693,11 +18785,17 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13705,12 +18803,18 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13718,11 +18822,17 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13730,11 +18840,17 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13742,11 +18858,17 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13754,11 +18876,17 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13766,48 +18894,72 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13815,12 +18967,18 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX1250-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13835,12 +18993,16 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13848,12 +19010,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13861,11 +19029,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13873,11 +19047,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13885,12 +19065,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13898,11 +19084,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13910,11 +19102,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13922,11 +19120,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13934,11 +19138,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13946,48 +19156,72 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13995,12 +19229,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14015,12 +19255,16 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14028,12 +19272,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14041,11 +19291,17 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14053,11 +19309,17 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14065,12 +19327,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14078,11 +19346,17 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14090,11 +19364,17 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14102,11 +19382,17 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14114,11 +19400,17 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14126,48 +19418,72 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14175,12 +19491,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14195,12 +19517,16 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; GFX6-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -14213,6 +19539,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; GFX7-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -14230,6 +19562,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14245,6 +19583,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14260,6 +19604,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -14277,6 +19627,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14292,6 +19648,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14307,6 +19669,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14322,6 +19690,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14337,6 +19711,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14352,6 +19732,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14367,6 +19753,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14382,6 +19774,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14398,6 +19796,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14422,12 +19826,16 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -14440,6 +19848,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -14457,6 +19871,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14472,6 +19892,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14487,6 +19913,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -14504,6 +19936,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14519,6 +19957,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14534,6 +19978,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14549,6 +19999,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14564,6 +20020,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14579,6 +20041,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14594,6 +20062,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14609,6 +20083,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14625,6 +20105,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14649,12 +20135,16 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -14667,6 +20157,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -14684,6 +20180,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14699,6 +20201,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14714,6 +20222,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -14731,6 +20245,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14746,6 +20266,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14761,6 +20287,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14776,6 +20308,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14791,6 +20329,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14806,6 +20350,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14821,6 +20371,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14836,6 +20392,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14852,6 +20414,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -14876,12 +20444,16 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -14894,6 +20466,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -14911,6 +20489,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14926,6 +20510,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14941,6 +20531,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -14958,6 +20554,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14973,6 +20575,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14988,6 +20596,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15003,6 +20617,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15018,6 +20638,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15033,6 +20659,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15048,6 +20680,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15063,6 +20701,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15079,6 +20723,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15103,12 +20753,16 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -15121,6 +20775,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -15138,6 +20798,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15153,6 +20819,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15168,6 +20840,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -15185,6 +20863,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15200,6 +20884,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15215,6 +20905,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15230,6 +20926,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15245,6 +20947,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15260,6 +20968,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15275,6 +20989,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15290,6 +21010,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15306,6 +21032,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15330,12 +21062,16 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -15348,6 +21084,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -15365,6 +21107,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15380,6 +21128,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15395,6 +21149,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -15412,6 +21172,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15427,6 +21193,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15442,6 +21214,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15457,6 +21235,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15472,6 +21256,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15487,6 +21277,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15502,6 +21298,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15517,6 +21319,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15533,6 +21341,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15557,12 +21371,16 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -15575,6 +21393,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -15592,6 +21416,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15607,6 +21437,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15622,6 +21458,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -15639,6 +21481,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15654,6 +21502,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15669,6 +21523,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15684,6 +21544,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15699,6 +21565,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15714,6 +21586,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15729,6 +21607,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15744,6 +21628,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15760,6 +21650,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15784,12 +21680,16 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -15802,6 +21702,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -15819,6 +21725,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15834,6 +21746,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15849,6 +21767,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -15866,6 +21790,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15881,6 +21811,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15896,6 +21832,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15911,6 +21853,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15926,6 +21874,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15941,6 +21895,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15956,6 +21916,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15971,6 +21937,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15987,6 +21959,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16011,12 +21989,16 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16029,6 +22011,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16046,6 +22034,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16061,6 +22055,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16076,6 +22076,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16093,6 +22099,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16108,6 +22120,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16123,6 +22141,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16138,6 +22162,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16153,6 +22183,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16168,6 +22204,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16183,6 +22225,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16198,6 +22246,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16214,6 +22268,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16238,12 +22298,16 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16256,6 +22320,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16273,6 +22343,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16288,6 +22364,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16303,6 +22385,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16320,6 +22408,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16335,6 +22429,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16350,6 +22450,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16365,6 +22471,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16380,6 +22492,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16395,6 +22513,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16410,6 +22534,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16425,6 +22555,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16441,6 +22577,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16465,12 +22607,16 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16483,6 +22629,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16500,6 +22652,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16515,6 +22673,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16530,6 +22694,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16547,6 +22717,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16562,6 +22738,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16577,6 +22759,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16592,6 +22780,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16607,6 +22801,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16622,6 +22822,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16637,6 +22843,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16652,6 +22864,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16668,6 +22886,12 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16692,12 +22916,16 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16710,6 +22938,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16727,6 +22961,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16742,6 +22982,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16757,6 +23003,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16774,6 +23026,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16789,6 +23047,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16804,6 +23068,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16819,6 +23089,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16834,6 +23110,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16849,6 +23131,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16864,6 +23152,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16879,6 +23173,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16895,6 +23195,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16919,12 +23225,16 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16937,6 +23247,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16954,6 +23270,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16969,6 +23291,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16984,6 +23312,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -17001,6 +23335,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17016,6 +23356,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17031,6 +23377,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17046,6 +23398,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17061,6 +23419,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17076,6 +23440,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17091,6 +23461,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17106,6 +23482,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17122,6 +23504,12 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -17146,12 +23534,16 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -17164,6 +23556,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -17181,6 +23579,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17196,6 +23600,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17211,6 +23621,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -17228,6 +23644,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17243,6 +23665,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17258,6 +23686,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17273,6 +23707,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17288,6 +23728,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17303,6 +23749,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17318,6 +23770,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17333,6 +23791,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17349,6 +23813,12 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -17373,12 +23843,16 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -17391,6 +23865,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -17408,6 +23888,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17423,6 +23909,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17438,6 +23930,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -17455,6 +23953,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17470,6 +23974,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17485,6 +23995,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17500,6 +24016,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17515,6 +24037,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17530,6 +24058,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17545,6 +24079,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17560,6 +24100,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17576,6 +24122,12 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll index 50031ea866d1..9c5f8c4f62ef 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -33,135 +33,169 @@ define amdgpu_kernel void @local_workgroup_unordered_load( ; ; GFX7-LABEL: local_workgroup_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -169,11 +203,15 @@ define amdgpu_kernel void @local_workgroup_unordered_load( ; ; GFX12-CU-LABEL: local_workgroup_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -182,11 +220,15 @@ define amdgpu_kernel void @local_workgroup_unordered_load( ; GFX1250-LABEL: local_workgroup_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -217,135 +259,169 @@ define amdgpu_kernel void @local_workgroup_monotonic_load( ; ; GFX7-LABEL: local_workgroup_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -353,11 +429,15 @@ define amdgpu_kernel void @local_workgroup_monotonic_load( ; ; GFX12-CU-LABEL: local_workgroup_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -366,11 +446,15 @@ define amdgpu_kernel void @local_workgroup_monotonic_load( ; GFX1250-LABEL: local_workgroup_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -401,10 +485,13 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; ; GFX7-LABEL: local_workgroup_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -415,9 +502,12 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; ; GFX10-WGP-LABEL: local_workgroup_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -428,9 +518,12 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; ; GFX10-CU-LABEL: local_workgroup_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -440,10 +533,13 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -454,9 +550,12 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -466,22 +565,28 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -491,22 +596,28 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -517,9 +628,12 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; ; GFX11-CU-LABEL: local_workgroup_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -529,25 +643,33 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; ; GFX12-WGP-LABEL: local_workgroup_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_workgroup_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -555,12 +677,16 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; GFX1250-LABEL: local_workgroup_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 ; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: ds_store_b32 v0, v1 ; GFX1250-NEXT: s_endpgm @@ -591,10 +717,13 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; ; GFX7-LABEL: local_workgroup_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_read_b32 v1, v0 @@ -606,9 +735,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; ; GFX10-WGP-LABEL: local_workgroup_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -621,9 +753,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; ; GFX10-CU-LABEL: local_workgroup_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -635,10 +770,13 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 @@ -650,9 +788,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -663,9 +804,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -677,9 +821,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -690,9 +837,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -704,9 +854,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; ; GFX11-WGP-LABEL: local_workgroup_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -719,9 +872,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; ; GFX11-CU-LABEL: local_workgroup_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -733,9 +889,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; ; GFX12-WGP-LABEL: local_workgroup_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -744,15 +903,19 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_workgroup_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -760,6 +923,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -767,14 +931,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX1250-LABEL: local_workgroup_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_load_b32 v1, v0 ; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: ds_store_b32 v0, v1 ; GFX1250-NEXT: s_endpgm @@ -788,12 +956,14 @@ entry: define amdgpu_kernel void @local_workgroup_unordered_store( ; GFX6-LABEL: local_workgroup_unordered_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -801,6 +971,10 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; GFX7-LABEL: local_workgroup_unordered_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -812,6 +986,10 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; GFX10-WGP-LABEL: local_workgroup_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -822,6 +1000,10 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; GFX10-CU-LABEL: local_workgroup_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -832,6 +1014,10 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; SKIP-CACHE-INV-LABEL: local_workgroup_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -843,6 +1029,10 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -853,6 +1043,10 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; GFX90A-TGSPLIT-LABEL: local_workgroup_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -863,6 +1057,10 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -873,6 +1071,10 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; GFX942-TGSPLIT-LABEL: local_workgroup_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -883,6 +1085,10 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; GFX11-WGP-LABEL: local_workgroup_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -893,6 +1099,10 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; GFX11-CU-LABEL: local_workgroup_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -903,6 +1113,10 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; GFX12-WGP-LABEL: local_workgroup_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -913,6 +1127,10 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; GFX12-CU-LABEL: local_workgroup_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -924,6 +1142,10 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -939,12 +1161,14 @@ entry: define amdgpu_kernel void @local_workgroup_monotonic_store( ; GFX6-LABEL: local_workgroup_monotonic_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -952,6 +1176,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; GFX7-LABEL: local_workgroup_monotonic_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -963,6 +1191,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; GFX10-WGP-LABEL: local_workgroup_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -973,6 +1205,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; GFX10-CU-LABEL: local_workgroup_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -983,6 +1219,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -994,6 +1234,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1004,6 +1248,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1014,6 +1262,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1024,6 +1276,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; GFX942-TGSPLIT-LABEL: local_workgroup_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1034,6 +1290,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; GFX11-WGP-LABEL: local_workgroup_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1044,6 +1304,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; GFX11-CU-LABEL: local_workgroup_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1054,6 +1318,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; GFX12-WGP-LABEL: local_workgroup_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1064,6 +1332,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; GFX12-CU-LABEL: local_workgroup_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1075,6 +1347,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -1090,12 +1366,14 @@ entry: define amdgpu_kernel void @local_workgroup_release_store( ; GFX6-LABEL: local_workgroup_release_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -1104,6 +1382,10 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX7-LABEL: local_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1116,6 +1398,10 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX10-WGP-LABEL: local_workgroup_release_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -1128,6 +1414,10 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX10-CU-LABEL: local_workgroup_release_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -1140,6 +1430,10 @@ define amdgpu_kernel void @local_workgroup_release_store( ; SKIP-CACHE-INV-LABEL: local_workgroup_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1152,6 +1446,10 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1163,6 +1461,10 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1174,6 +1476,10 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1185,6 +1491,10 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX942-TGSPLIT-LABEL: local_workgroup_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1196,6 +1506,10 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX11-WGP-LABEL: local_workgroup_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1208,6 +1522,10 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX11-CU-LABEL: local_workgroup_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1220,6 +1538,10 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX12-WGP-LABEL: local_workgroup_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1234,6 +1556,10 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX12-CU-LABEL: local_workgroup_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1249,6 +1575,10 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -1266,12 +1596,14 @@ entry: define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX6-LABEL: local_workgroup_seq_cst_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -1280,6 +1612,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX7-LABEL: local_workgroup_seq_cst_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1292,6 +1628,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX10-WGP-LABEL: local_workgroup_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -1304,6 +1644,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX10-CU-LABEL: local_workgroup_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -1316,6 +1660,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1328,6 +1676,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1339,6 +1691,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -1350,6 +1706,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1361,6 +1721,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX942-TGSPLIT-LABEL: local_workgroup_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -1372,6 +1736,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX11-WGP-LABEL: local_workgroup_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1384,6 +1752,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX11-CU-LABEL: local_workgroup_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1396,6 +1768,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX12-WGP-LABEL: local_workgroup_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -1410,6 +1786,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX12-CU-LABEL: local_workgroup_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1425,6 +1805,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -1443,133 +1827,183 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw( ; GFX6-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -1577,10 +2011,14 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw( ; GFX1250-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -1594,11 +2032,13 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; GFX6-LABEL: local_workgroup_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1606,11 +2046,15 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; GFX7-LABEL: local_workgroup_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1618,10 +2062,14 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: local_workgroup_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1630,10 +2078,14 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; GFX10-CU-LABEL: local_workgroup_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1641,11 +2093,15 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1653,10 +2109,14 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1664,10 +2124,14 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -1675,10 +2139,14 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1686,10 +2154,14 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 @@ -1697,10 +2169,14 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: local_workgroup_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1709,10 +2185,14 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; GFX11-CU-LABEL: local_workgroup_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1720,10 +2200,14 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: local_workgroup_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -1732,10 +2216,14 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; GFX12-CU-LABEL: local_workgroup_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -1744,10 +2232,14 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; GFX1250-LABEL: local_workgroup_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -1762,11 +2254,13 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX6-LABEL: local_workgroup_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1774,11 +2268,15 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; GFX7-LABEL: local_workgroup_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1786,10 +2284,14 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; GFX10-WGP-LABEL: local_workgroup_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1798,10 +2300,14 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; GFX10-CU-LABEL: local_workgroup_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1810,11 +2316,15 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1822,10 +2332,14 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1833,10 +2347,14 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1844,10 +2362,14 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1855,10 +2377,14 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1866,10 +2392,14 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; GFX11-WGP-LABEL: local_workgroup_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1878,10 +2408,14 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; GFX11-CU-LABEL: local_workgroup_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1890,10 +2424,14 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; GFX12-WGP-LABEL: local_workgroup_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -1904,10 +2442,14 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; GFX12-CU-LABEL: local_workgroup_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -1919,10 +2461,14 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX1250-LABEL: local_workgroup_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1938,11 +2484,13 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX6-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1951,11 +2499,15 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; GFX7-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -1964,10 +2516,14 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1978,10 +2534,14 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1991,11 +2551,15 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2004,10 +2568,14 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2016,10 +2584,14 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2028,10 +2600,14 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2040,10 +2616,14 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2052,10 +2632,14 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2066,10 +2650,14 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2079,10 +2667,14 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -2095,10 +2687,14 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -2111,10 +2707,14 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX1250-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2131,11 +2731,13 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX6-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2144,11 +2746,15 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; GFX7-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2157,10 +2763,14 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2171,10 +2781,14 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2184,11 +2798,15 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2197,10 +2815,14 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2209,10 +2831,14 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2221,10 +2847,14 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2233,10 +2863,14 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 @@ -2245,10 +2879,14 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2259,10 +2897,14 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2272,10 +2914,14 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -2288,10 +2934,14 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -2304,10 +2954,14 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX1250-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2324,11 +2978,13 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX6-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2340,6 +2996,10 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX7-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2355,6 +3015,10 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX10-WGP-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2369,6 +3033,10 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX10-CU-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2382,6 +3050,10 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2397,6 +3069,10 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2410,6 +3086,10 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2424,6 +3104,10 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2437,6 +3121,10 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2451,6 +3139,10 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2465,6 +3157,10 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2478,6 +3174,10 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2492,6 +3192,10 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2506,6 +3210,10 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -2526,11 +3234,13 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 @@ -2543,6 +3253,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX7-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2559,6 +3273,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX10-WGP-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2575,6 +3293,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX10-CU-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2590,6 +3312,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2606,6 +3332,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2620,6 +3350,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2635,6 +3369,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2649,6 +3387,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2664,6 +3406,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2680,6 +3426,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2695,6 +3445,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2713,6 +3467,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2731,6 +3489,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -2753,11 +3515,13 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 @@ -2770,6 +3534,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX7-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2786,6 +3554,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX10-WGP-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2802,6 +3574,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX10-CU-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2817,6 +3593,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2833,6 +3613,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2847,6 +3631,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2862,6 +3650,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2876,6 +3668,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -2891,6 +3687,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2907,6 +3707,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2922,6 +3726,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2940,6 +3748,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2958,6 +3770,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -2980,12 +3796,16 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -2993,12 +3813,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3006,11 +3832,17 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3018,11 +3850,17 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3030,12 +3868,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3043,11 +3887,17 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3055,11 +3905,17 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3067,11 +3923,17 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3079,11 +3941,17 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3091,48 +3959,72 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -3140,12 +4032,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -3160,12 +4058,16 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; GFX6-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3174,12 +4076,18 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3188,11 +4096,17 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3202,11 +4116,17 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3215,12 +4135,18 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3229,11 +4155,17 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3242,11 +4174,17 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3255,11 +4193,17 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3268,11 +4212,17 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -3281,12 +4231,18 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3295,12 +4251,18 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3308,12 +4270,18 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -3322,12 +4290,18 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -3336,12 +4310,18 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -3357,12 +4337,16 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX6-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3371,12 +4355,18 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; GFX7-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3385,11 +4375,17 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3399,11 +4395,17 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3413,12 +4415,18 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3427,11 +4435,17 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3440,11 +4454,17 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3453,11 +4473,17 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3466,11 +4492,17 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3479,12 +4511,18 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3493,12 +4531,18 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3507,12 +4551,18 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -3523,12 +4573,18 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -3540,12 +4596,18 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX1250-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3562,12 +4624,16 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3577,12 +4643,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3592,11 +4664,17 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3608,11 +4686,17 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3623,12 +4707,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3638,11 +4728,17 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3652,11 +4748,17 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3666,11 +4768,17 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3680,11 +4788,17 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3694,12 +4808,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3710,12 +4830,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3725,12 +4851,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -3743,12 +4875,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -3761,12 +4899,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3784,12 +4928,16 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3799,12 +4947,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3814,11 +4968,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3830,11 +4990,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3845,12 +5011,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3860,11 +5032,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3874,11 +5052,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3888,11 +5072,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3902,11 +5092,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3916,12 +5112,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3932,12 +5134,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3947,12 +5155,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -3965,12 +5179,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -3983,12 +5203,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -4006,12 +5232,16 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; GFX6-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4020,12 +5250,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4034,11 +5270,17 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4048,11 +5290,17 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4061,12 +5309,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4075,11 +5329,17 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4088,11 +5348,17 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4101,11 +5367,17 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4114,11 +5386,17 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4127,12 +5405,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4141,12 +5425,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4154,12 +5444,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -4168,12 +5464,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -4182,12 +5484,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -4203,12 +5511,16 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4217,12 +5529,18 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4231,11 +5549,17 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4245,11 +5569,17 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4258,12 +5588,18 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4272,11 +5608,17 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4285,11 +5627,17 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4298,11 +5646,17 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4311,11 +5665,17 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -4324,12 +5684,18 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4338,12 +5704,18 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4351,12 +5723,18 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -4365,12 +5743,18 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -4379,12 +5763,18 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; GFX1250-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -4400,12 +5790,16 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX6-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4415,12 +5809,18 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; GFX7-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -4430,11 +5830,17 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4446,11 +5852,17 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4461,12 +5873,18 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4476,11 +5894,17 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4490,11 +5914,17 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4504,11 +5934,17 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4518,11 +5954,17 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4532,12 +5974,18 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4548,12 +5996,18 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4563,12 +6017,18 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -4581,12 +6041,18 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -4599,12 +6065,18 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX1250-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -4622,12 +6094,16 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4637,12 +6113,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -4652,11 +6134,17 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4668,11 +6156,17 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4683,12 +6177,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4698,11 +6198,17 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4712,11 +6218,17 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4726,11 +6238,17 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4740,11 +6258,17 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4754,12 +6278,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4770,12 +6300,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4785,12 +6321,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -4803,12 +6345,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -4821,12 +6369,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -4844,12 +6398,16 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4859,12 +6417,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -4874,11 +6438,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4890,11 +6460,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4905,12 +6481,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -4920,11 +6502,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4934,11 +6522,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4948,11 +6542,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4962,11 +6562,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4976,12 +6582,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4992,12 +6604,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5007,12 +6625,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -5025,12 +6649,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -5043,12 +6673,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5066,12 +6702,16 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5081,12 +6721,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5096,11 +6742,17 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5112,11 +6764,17 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5127,12 +6785,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5142,11 +6806,17 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5156,11 +6826,17 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5170,11 +6846,17 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5184,11 +6866,17 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5198,12 +6886,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5214,12 +6908,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5229,12 +6929,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -5247,12 +6953,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -5265,12 +6977,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5288,12 +7006,16 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5303,12 +7025,18 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5318,11 +7046,17 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5334,11 +7068,17 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5349,12 +7089,18 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5364,11 +7110,17 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5378,11 +7130,17 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5392,11 +7150,17 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5406,11 +7170,17 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5420,12 +7190,18 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5436,12 +7212,18 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5451,12 +7233,18 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -5469,12 +7257,18 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -5487,12 +7281,18 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5510,12 +7310,16 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX6-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5525,12 +7329,18 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5540,11 +7350,17 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5556,11 +7372,17 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5571,12 +7393,18 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5586,11 +7414,17 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5600,11 +7434,17 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5614,11 +7454,17 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5628,11 +7474,17 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5642,12 +7494,18 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5658,12 +7516,18 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5673,12 +7537,18 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -5691,12 +7561,18 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -5709,12 +7585,18 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX1250-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5732,12 +7614,16 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5747,12 +7633,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5762,11 +7654,17 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5778,11 +7676,17 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5793,12 +7697,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5808,11 +7718,17 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5822,11 +7738,17 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5836,11 +7758,17 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5850,11 +7778,17 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5864,12 +7798,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5880,12 +7820,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5895,12 +7841,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -5913,12 +7865,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -5931,12 +7889,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5954,12 +7918,16 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5969,12 +7937,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5984,11 +7958,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6000,11 +7980,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6015,12 +8001,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6030,11 +8022,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6044,11 +8042,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6058,11 +8062,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6072,11 +8082,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6086,12 +8102,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6102,12 +8124,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6117,12 +8145,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 @@ -6135,12 +8169,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 @@ -6153,12 +8193,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -6176,12 +8222,16 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6194,6 +8244,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -6211,6 +8267,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6226,6 +8288,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6241,6 +8309,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -6258,6 +8332,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6273,6 +8353,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6288,6 +8374,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6303,6 +8395,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6318,6 +8416,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6333,6 +8437,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6348,6 +8458,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6363,6 +8479,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6379,6 +8501,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6403,12 +8531,16 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6421,6 +8553,12 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -6438,6 +8576,12 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6454,6 +8598,12 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6469,6 +8619,12 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -6486,6 +8642,12 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6501,6 +8663,12 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6517,6 +8685,12 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6532,6 +8706,12 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6548,6 +8728,12 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6564,6 +8750,12 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6579,6 +8771,12 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6595,6 +8793,12 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6611,6 +8815,12 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6635,12 +8845,16 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6654,6 +8868,12 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -6672,6 +8892,12 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6689,6 +8915,12 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6706,6 +8938,12 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -6724,6 +8962,12 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6740,6 +8984,12 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6756,6 +9006,12 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6772,6 +9028,12 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6788,6 +9050,12 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6805,6 +9073,12 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6822,6 +9096,12 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6841,6 +9121,12 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6861,6 +9147,12 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6887,12 +9179,16 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6906,6 +9202,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -6924,6 +9226,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6942,6 +9250,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6959,6 +9273,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -6977,6 +9297,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6993,6 +9319,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7010,6 +9342,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7026,6 +9364,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7043,6 +9387,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7061,6 +9411,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7078,6 +9434,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7098,6 +9460,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7118,6 +9486,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7144,12 +9518,16 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7163,6 +9541,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7181,6 +9565,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7199,6 +9589,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7216,6 +9612,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7234,6 +9636,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7250,6 +9658,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7267,6 +9681,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7283,6 +9703,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7300,6 +9726,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7318,6 +9750,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7335,6 +9773,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7355,6 +9799,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7375,6 +9825,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7401,12 +9857,16 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -7419,6 +9879,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7436,6 +9902,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7452,6 +9924,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7467,6 +9945,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7484,6 +9968,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7499,6 +9989,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7515,6 +10011,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7530,6 +10032,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7546,6 +10054,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7562,6 +10076,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7577,6 +10097,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7593,6 +10119,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7609,6 +10141,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7633,12 +10171,16 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -7651,6 +10193,12 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7668,6 +10216,12 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7684,6 +10238,12 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7699,6 +10259,12 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7716,6 +10282,12 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7731,6 +10303,12 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7747,6 +10325,12 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7762,6 +10346,12 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7778,6 +10368,12 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7794,6 +10390,12 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7809,6 +10411,12 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7825,6 +10433,12 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7841,6 +10455,12 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7865,12 +10485,16 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7884,6 +10508,12 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -7902,6 +10532,12 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7920,6 +10556,12 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7937,6 +10579,12 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -7955,6 +10603,12 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7971,6 +10625,12 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7988,6 +10648,12 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8004,6 +10670,12 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8021,6 +10693,12 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8039,6 +10717,12 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8056,6 +10740,12 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8076,6 +10766,12 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8096,6 +10792,12 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -8122,12 +10824,16 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8141,6 +10847,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -8159,6 +10871,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8177,6 +10895,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8194,6 +10918,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -8212,6 +10942,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8228,6 +10964,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8245,6 +10987,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8261,6 +11009,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8278,6 +11032,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8296,6 +11056,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8313,6 +11079,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8333,6 +11105,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8353,6 +11131,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -8379,12 +11163,16 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8398,6 +11186,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -8416,6 +11210,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8434,6 +11234,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8451,6 +11257,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -8469,6 +11281,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8485,6 +11303,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8502,6 +11326,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8518,6 +11348,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8535,6 +11371,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8553,6 +11395,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8570,6 +11418,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8590,6 +11444,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8610,6 +11470,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -8636,12 +11502,16 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8655,6 +11525,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -8673,6 +11549,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8691,6 +11573,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8708,6 +11596,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -8726,6 +11620,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8742,6 +11642,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8759,6 +11665,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8775,6 +11687,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8792,6 +11710,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8810,6 +11734,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8827,6 +11757,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8847,6 +11783,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8867,6 +11809,12 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -8893,12 +11841,16 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8912,6 +11864,12 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -8930,6 +11888,12 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8948,6 +11912,12 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8965,6 +11935,12 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -8983,6 +11959,12 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8999,6 +11981,12 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9016,6 +12004,12 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9032,6 +12026,12 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9049,6 +12049,12 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9067,6 +12073,12 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9084,6 +12096,12 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9104,6 +12122,12 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9124,6 +12148,12 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9150,12 +12180,16 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9169,6 +12203,12 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -9187,6 +12227,12 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9205,6 +12251,12 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9222,6 +12274,12 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -9240,6 +12298,12 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9256,6 +12320,12 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9273,6 +12343,12 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9289,6 +12365,12 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9306,6 +12388,12 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9324,6 +12412,12 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9341,6 +12435,12 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9361,6 +12461,12 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9381,6 +12487,12 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9407,12 +12519,16 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9426,6 +12542,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -9444,6 +12566,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9462,6 +12590,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9479,6 +12613,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -9497,6 +12637,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9513,6 +12659,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9530,6 +12682,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9546,6 +12704,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9563,6 +12727,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9581,6 +12751,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9598,6 +12774,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9618,6 +12800,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9638,6 +12826,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9664,12 +12858,16 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9683,6 +12881,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -9701,6 +12905,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9719,6 +12929,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9736,6 +12952,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -9754,6 +12976,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9770,6 +12998,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9787,6 +13021,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9803,6 +13043,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9820,6 +13066,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9838,6 +13090,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9855,6 +13113,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9875,6 +13139,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9895,6 +13165,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9936,135 +13212,169 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load( ; ; GFX7-LABEL: local_workgroup_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -10072,11 +13382,15 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load( ; ; GFX12-CU-LABEL: local_workgroup_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -10085,11 +13399,15 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load( ; GFX1250-LABEL: local_workgroup_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -10120,135 +13438,169 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( ; ; GFX7-LABEL: local_workgroup_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -10256,11 +13608,15 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( ; ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -10269,11 +13625,15 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( ; GFX1250-LABEL: local_workgroup_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -10304,135 +13664,169 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; ; GFX7-LABEL: local_workgroup_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -10440,11 +13834,15 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -10453,11 +13851,15 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; GFX1250-LABEL: local_workgroup_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -10488,135 +13890,169 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -10624,11 +14060,15 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -10637,11 +14077,15 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 @@ -10656,12 +14100,14 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; GFX6-LABEL: local_workgroup_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -10669,6 +14115,10 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; GFX7-LABEL: local_workgroup_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10680,6 +14130,10 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; GFX10-WGP-LABEL: local_workgroup_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -10690,6 +14144,10 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; GFX10-CU-LABEL: local_workgroup_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -10700,6 +14158,10 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10711,6 +14173,10 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -10721,6 +14187,10 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -10731,6 +14201,10 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -10741,6 +14215,10 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -10751,6 +14229,10 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; GFX11-WGP-LABEL: local_workgroup_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -10761,6 +14243,10 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; GFX11-CU-LABEL: local_workgroup_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -10771,6 +14257,10 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; GFX12-WGP-LABEL: local_workgroup_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -10781,6 +14271,10 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; GFX12-CU-LABEL: local_workgroup_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -10792,6 +14286,10 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -10807,12 +14305,14 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; GFX6-LABEL: local_workgroup_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -10820,6 +14320,10 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; GFX7-LABEL: local_workgroup_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10831,6 +14335,10 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -10841,6 +14349,10 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -10851,6 +14363,10 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10862,6 +14378,10 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -10872,6 +14392,10 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -10882,6 +14406,10 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -10892,6 +14420,10 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -10902,6 +14434,10 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -10912,6 +14448,10 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -10922,6 +14462,10 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -10932,6 +14476,10 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -10943,6 +14491,10 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -10958,12 +14510,14 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX6-LABEL: local_workgroup_one_as_release_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -10971,6 +14525,10 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX7-LABEL: local_workgroup_one_as_release_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10982,6 +14540,10 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX10-WGP-LABEL: local_workgroup_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -10992,6 +14554,10 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX10-CU-LABEL: local_workgroup_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -11002,6 +14568,10 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11013,6 +14583,10 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -11023,6 +14597,10 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -11033,6 +14611,10 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -11043,6 +14625,10 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -11053,6 +14639,10 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX11-WGP-LABEL: local_workgroup_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -11063,6 +14653,10 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX11-CU-LABEL: local_workgroup_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -11073,6 +14667,10 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX12-WGP-LABEL: local_workgroup_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -11083,6 +14681,10 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX12-CU-LABEL: local_workgroup_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -11094,6 +14696,10 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -11109,12 +14715,14 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX6-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -11122,6 +14730,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX7-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11133,6 +14745,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -11143,6 +14759,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -11153,6 +14773,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11164,6 +14788,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -11174,6 +14802,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -11184,6 +14816,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -11194,6 +14830,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -11204,6 +14844,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -11214,6 +14858,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -11224,6 +14872,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -11234,6 +14886,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -11245,6 +14901,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -11261,133 +14921,183 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( ; GFX6-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -11395,10 +15105,14 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( ; GFX1250-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -11412,133 +15126,183 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX6-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -11546,10 +15310,14 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX1250-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -11563,133 +15331,183 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX6-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -11697,10 +15515,14 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX1250-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -11714,133 +15536,183 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -11848,10 +15720,14 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX1250-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -11865,133 +15741,183 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm @@ -11999,10 +15925,14 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm @@ -12016,11 +15946,13 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -12032,6 +15964,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12047,6 +15983,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -12060,6 +16000,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -12073,6 +16017,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12088,6 +16036,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -12101,6 +16053,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -12114,6 +16070,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -12127,6 +16087,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -12140,6 +16104,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12153,6 +16121,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12166,6 +16138,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12179,6 +16155,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12193,6 +16173,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -12213,11 +16197,13 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -12229,6 +16215,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12244,6 +16234,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -12257,6 +16251,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -12270,6 +16268,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12285,6 +16287,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -12298,6 +16304,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -12311,6 +16321,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -12324,6 +16338,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -12337,6 +16355,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12350,6 +16372,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12363,6 +16389,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12376,6 +16406,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12390,6 +16424,10 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -12410,11 +16448,13 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -12426,6 +16466,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12441,6 +16485,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -12454,6 +16502,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -12467,6 +16519,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -12482,6 +16538,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -12495,6 +16555,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -12508,6 +16572,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -12521,6 +16589,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -12534,6 +16606,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12547,6 +16623,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12560,6 +16640,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12573,6 +16657,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12587,6 +16675,10 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -12607,12 +16699,16 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12620,12 +16716,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12633,11 +16735,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12645,11 +16753,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12657,12 +16771,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12670,11 +16790,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12682,11 +16808,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12694,11 +16826,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12706,11 +16844,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12718,48 +16862,72 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -12767,12 +16935,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -12787,12 +16961,16 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12800,12 +16978,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12813,11 +16997,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12825,11 +17015,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12837,12 +17033,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12850,11 +17052,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12862,11 +17070,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12874,11 +17088,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12886,11 +17106,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12898,48 +17124,72 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -12947,12 +17197,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -12967,12 +17223,16 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12980,12 +17240,18 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -12993,11 +17259,17 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13005,11 +17277,17 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13017,12 +17295,18 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13030,11 +17314,17 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13042,11 +17332,17 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13054,11 +17350,17 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13066,11 +17368,17 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13078,48 +17386,72 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13127,12 +17459,18 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX1250-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13147,12 +17485,16 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13160,12 +17502,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13173,11 +17521,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13185,11 +17539,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13197,12 +17557,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13210,11 +17576,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13222,11 +17594,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13234,11 +17612,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13246,11 +17630,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13258,48 +17648,72 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13307,12 +17721,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13327,12 +17747,16 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13340,12 +17764,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13353,11 +17783,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13365,11 +17801,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13377,12 +17819,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13390,11 +17838,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13402,11 +17856,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13414,11 +17874,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13426,11 +17892,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13438,48 +17910,72 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13487,12 +17983,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13507,12 +18009,16 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13520,12 +18026,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13533,11 +18045,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13545,11 +18063,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13557,12 +18081,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13570,11 +18100,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13582,11 +18118,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13594,11 +18136,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13606,11 +18154,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13618,48 +18172,72 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13667,12 +18245,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13687,12 +18271,16 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13700,12 +18288,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13713,11 +18307,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13725,11 +18325,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13737,12 +18343,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13750,11 +18362,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13762,11 +18380,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13774,11 +18398,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13786,11 +18416,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13798,48 +18434,72 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -13847,12 +18507,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX1250-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -13867,12 +18533,16 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13880,12 +18550,18 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13893,11 +18569,17 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13905,11 +18587,17 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13917,12 +18605,18 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13930,11 +18624,17 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13942,11 +18642,17 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13954,11 +18660,17 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13966,11 +18678,17 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -13978,48 +18696,72 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14027,12 +18769,18 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX1250-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14047,12 +18795,16 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14060,12 +18812,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14073,11 +18831,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14085,11 +18849,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14097,12 +18867,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14110,11 +18886,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14122,11 +18904,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14134,11 +18922,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14146,11 +18940,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14158,48 +18958,72 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14207,12 +19031,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14227,12 +19057,16 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14240,12 +19074,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14253,11 +19093,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14265,11 +19111,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14277,12 +19129,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14290,11 +19148,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14302,11 +19166,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14314,11 +19184,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14326,11 +19202,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14338,48 +19220,72 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14387,12 +19293,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14407,12 +19319,16 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14420,12 +19336,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14433,11 +19355,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14445,11 +19373,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14457,12 +19391,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14470,11 +19410,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14482,11 +19428,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14494,11 +19446,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14506,11 +19464,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14518,48 +19482,72 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14567,12 +19555,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14587,12 +19581,16 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14600,12 +19598,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14613,11 +19617,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14625,11 +19635,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14637,12 +19653,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14650,11 +19672,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14662,11 +19690,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14674,11 +19708,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14686,11 +19726,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14698,48 +19744,72 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14747,12 +19817,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14767,12 +19843,16 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14780,12 +19860,18 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14793,11 +19879,17 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14805,11 +19897,17 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14817,12 +19915,18 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14830,11 +19934,17 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14842,11 +19952,17 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14854,11 +19970,17 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14866,11 +19988,17 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14878,48 +20006,72 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -14927,12 +20079,18 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX1250-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -14947,12 +20105,16 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14960,12 +20122,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14973,11 +20141,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14985,11 +20159,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -14997,12 +20177,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15010,11 +20196,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15022,11 +20214,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15034,11 +20232,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15046,11 +20250,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15058,48 +20268,72 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -15107,12 +20341,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -15127,12 +20367,16 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15140,12 +20384,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15153,11 +20403,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15165,11 +20421,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15177,12 +20439,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15190,11 +20458,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15202,11 +20476,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15214,11 +20494,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15226,11 +20512,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 @@ -15238,48 +20530,72 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm @@ -15287,12 +20603,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm @@ -15307,12 +20629,16 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; GFX6-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -15325,6 +20651,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; GFX7-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -15342,6 +20674,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15357,6 +20695,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15372,6 +20716,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -15389,6 +20739,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15404,6 +20760,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15419,6 +20781,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15434,6 +20802,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15449,6 +20823,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15464,6 +20844,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15479,6 +20865,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15494,6 +20886,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15510,6 +20908,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15534,12 +20938,16 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -15552,6 +20960,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -15569,6 +20983,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15584,6 +21004,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15599,6 +21025,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -15616,6 +21048,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15631,6 +21069,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15646,6 +21090,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15661,6 +21111,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15676,6 +21132,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15691,6 +21153,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15706,6 +21174,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15721,6 +21195,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15737,6 +21217,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15761,12 +21247,16 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -15779,6 +21269,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -15796,6 +21292,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15811,6 +21313,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15826,6 +21334,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -15843,6 +21357,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15858,6 +21378,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15873,6 +21399,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15888,6 +21420,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15903,6 +21441,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15918,6 +21462,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15933,6 +21483,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15948,6 +21504,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15964,6 +21526,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -15988,12 +21556,16 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16006,6 +21578,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16023,6 +21601,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16038,6 +21622,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16053,6 +21643,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16070,6 +21666,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16085,6 +21687,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16100,6 +21708,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16115,6 +21729,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16130,6 +21750,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16145,6 +21771,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16160,6 +21792,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16175,6 +21813,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16191,6 +21835,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16215,12 +21865,16 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16233,6 +21887,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16250,6 +21910,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16265,6 +21931,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16280,6 +21952,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16297,6 +21975,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16312,6 +21996,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16327,6 +22017,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16342,6 +22038,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16357,6 +22059,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16372,6 +22080,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16387,6 +22101,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16402,6 +22122,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16418,6 +22144,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16442,12 +22174,16 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16460,6 +22196,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16477,6 +22219,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16492,6 +22240,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16507,6 +22261,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16524,6 +22284,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16539,6 +22305,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16554,6 +22326,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16569,6 +22347,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16584,6 +22368,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16599,6 +22389,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16614,6 +22410,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16629,6 +22431,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16645,6 +22453,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16669,12 +22483,16 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16687,6 +22505,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16704,6 +22528,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16719,6 +22549,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16734,6 +22570,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16751,6 +22593,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16766,6 +22614,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16781,6 +22635,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16796,6 +22656,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16811,6 +22677,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16826,6 +22698,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16841,6 +22719,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16856,6 +22740,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16872,6 +22762,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -16896,12 +22792,16 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -16914,6 +22814,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -16931,6 +22837,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16946,6 +22858,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16961,6 +22879,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -16978,6 +22902,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16993,6 +22923,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17008,6 +22944,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17023,6 +22965,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17038,6 +22986,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17053,6 +23007,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17068,6 +23028,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17083,6 +23049,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17099,6 +23071,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -17123,12 +23101,16 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -17141,6 +23123,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -17158,6 +23146,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17173,6 +23167,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17188,6 +23188,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -17205,6 +23211,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17220,6 +23232,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17235,6 +23253,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17250,6 +23274,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17265,6 +23295,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17280,6 +23316,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17295,6 +23337,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17310,6 +23358,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17326,6 +23380,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -17350,12 +23410,16 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -17368,6 +23432,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -17385,6 +23455,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17400,6 +23476,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17415,6 +23497,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -17432,6 +23520,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17447,6 +23541,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17462,6 +23562,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17477,6 +23583,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17492,6 +23604,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17507,6 +23625,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17522,6 +23646,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17537,6 +23667,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17553,6 +23689,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -17577,12 +23719,16 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -17595,6 +23741,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -17612,6 +23764,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17627,6 +23785,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17642,6 +23806,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -17659,6 +23829,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17674,6 +23850,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17689,6 +23871,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17704,6 +23892,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17719,6 +23913,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17734,6 +23934,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17749,6 +23955,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17764,6 +23976,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17780,6 +23998,12 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -17804,12 +24028,16 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -17822,6 +24050,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -17839,6 +24073,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17854,6 +24094,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17869,6 +24115,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -17886,6 +24138,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17901,6 +24159,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17916,6 +24180,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17931,6 +24201,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17946,6 +24222,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17961,6 +24243,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17976,6 +24264,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17991,6 +24285,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18007,6 +24307,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -18031,12 +24337,16 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -18049,6 +24359,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -18066,6 +24382,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18081,6 +24403,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18096,6 +24424,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -18113,6 +24447,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18128,6 +24468,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18143,6 +24489,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18158,6 +24510,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18173,6 +24531,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18188,6 +24552,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18203,6 +24573,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18218,6 +24594,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18234,6 +24616,12 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -18258,12 +24646,16 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -18276,6 +24668,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -18293,6 +24691,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18308,6 +24712,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18323,6 +24733,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -18340,6 +24756,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18355,6 +24777,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18370,6 +24798,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18385,6 +24819,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18400,6 +24840,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18415,6 +24861,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18430,6 +24882,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18445,6 +24903,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18461,6 +24925,12 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -18485,12 +24955,16 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 @@ -18503,6 +24977,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -18520,6 +25000,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18535,6 +25021,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18550,6 +25042,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -18567,6 +25065,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18582,6 +25086,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18597,6 +25107,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18612,6 +25128,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18627,6 +25149,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18642,6 +25170,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18657,6 +25191,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18672,6 +25212,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18688,6 +25234,12 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll index 220bc97a6822..1b511c6fca4a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll @@ -19,11 +19,15 @@ define amdgpu_kernel void @private_agent_unordered_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -33,11 +37,15 @@ define amdgpu_kernel void @private_agent_unordered_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -47,11 +55,15 @@ define amdgpu_kernel void @private_agent_unordered_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -61,11 +73,15 @@ define amdgpu_kernel void @private_agent_unordered_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -79,11 +95,15 @@ define amdgpu_kernel void @private_agent_unordered_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -93,11 +113,15 @@ define amdgpu_kernel void @private_agent_unordered_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -107,11 +131,15 @@ define amdgpu_kernel void @private_agent_unordered_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -119,84 +147,108 @@ define amdgpu_kernel void @private_agent_unordered_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_agent_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_agent_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_agent_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_agent_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_agent_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_agent_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_agent_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -205,6 +257,7 @@ define amdgpu_kernel void @private_agent_unordered_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -219,11 +272,15 @@ define amdgpu_kernel void @private_agent_monotonic_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -233,11 +290,15 @@ define amdgpu_kernel void @private_agent_monotonic_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -247,11 +308,15 @@ define amdgpu_kernel void @private_agent_monotonic_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -261,11 +326,15 @@ define amdgpu_kernel void @private_agent_monotonic_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -279,11 +348,15 @@ define amdgpu_kernel void @private_agent_monotonic_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -293,11 +366,15 @@ define amdgpu_kernel void @private_agent_monotonic_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -307,11 +384,15 @@ define amdgpu_kernel void @private_agent_monotonic_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -319,84 +400,108 @@ define amdgpu_kernel void @private_agent_monotonic_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_agent_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_agent_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_agent_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_agent_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_agent_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_agent_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_agent_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -405,6 +510,7 @@ define amdgpu_kernel void @private_agent_monotonic_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -419,11 +525,15 @@ define amdgpu_kernel void @private_agent_acquire_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -433,11 +543,15 @@ define amdgpu_kernel void @private_agent_acquire_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -447,11 +561,15 @@ define amdgpu_kernel void @private_agent_acquire_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -461,11 +579,15 @@ define amdgpu_kernel void @private_agent_acquire_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -479,11 +601,15 @@ define amdgpu_kernel void @private_agent_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -493,11 +619,15 @@ define amdgpu_kernel void @private_agent_acquire_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -507,11 +637,15 @@ define amdgpu_kernel void @private_agent_acquire_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -519,84 +653,108 @@ define amdgpu_kernel void @private_agent_acquire_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_agent_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_agent_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_agent_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_agent_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_agent_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_agent_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_agent_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -607,6 +765,7 @@ define amdgpu_kernel void @private_agent_acquire_load( ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -621,11 +780,15 @@ define amdgpu_kernel void @private_agent_seq_cst_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -635,11 +798,15 @@ define amdgpu_kernel void @private_agent_seq_cst_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -649,11 +816,15 @@ define amdgpu_kernel void @private_agent_seq_cst_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -663,11 +834,15 @@ define amdgpu_kernel void @private_agent_seq_cst_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -681,11 +856,15 @@ define amdgpu_kernel void @private_agent_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -695,11 +874,15 @@ define amdgpu_kernel void @private_agent_seq_cst_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -709,11 +892,15 @@ define amdgpu_kernel void @private_agent_seq_cst_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -721,84 +908,108 @@ define amdgpu_kernel void @private_agent_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_agent_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_agent_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_agent_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_agent_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_agent_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_agent_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_agent_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -811,6 +1022,7 @@ define amdgpu_kernel void @private_agent_seq_cst_load( ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -825,10 +1037,14 @@ define amdgpu_kernel void @private_agent_unordered_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -837,10 +1053,14 @@ define amdgpu_kernel void @private_agent_unordered_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -849,10 +1069,14 @@ define amdgpu_kernel void @private_agent_unordered_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -861,10 +1085,14 @@ define amdgpu_kernel void @private_agent_unordered_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -877,10 +1105,14 @@ define amdgpu_kernel void @private_agent_unordered_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -889,10 +1121,14 @@ define amdgpu_kernel void @private_agent_unordered_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -901,65 +1137,93 @@ define amdgpu_kernel void @private_agent_unordered_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_agent_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_agent_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_agent_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_agent_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_agent_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_agent_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -967,22 +1231,26 @@ define amdgpu_kernel void @private_agent_unordered_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -1004,10 +1272,14 @@ define amdgpu_kernel void @private_agent_monotonic_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -1016,10 +1288,14 @@ define amdgpu_kernel void @private_agent_monotonic_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -1028,10 +1304,14 @@ define amdgpu_kernel void @private_agent_monotonic_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -1040,10 +1320,14 @@ define amdgpu_kernel void @private_agent_monotonic_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -1056,10 +1340,14 @@ define amdgpu_kernel void @private_agent_monotonic_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1068,10 +1356,14 @@ define amdgpu_kernel void @private_agent_monotonic_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1080,65 +1372,93 @@ define amdgpu_kernel void @private_agent_monotonic_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_agent_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_agent_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_agent_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_agent_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_agent_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_agent_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -1146,22 +1466,26 @@ define amdgpu_kernel void @private_agent_monotonic_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -1183,10 +1507,14 @@ define amdgpu_kernel void @private_agent_release_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -1195,10 +1523,14 @@ define amdgpu_kernel void @private_agent_release_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -1207,10 +1539,14 @@ define amdgpu_kernel void @private_agent_release_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -1219,10 +1555,14 @@ define amdgpu_kernel void @private_agent_release_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -1235,10 +1575,14 @@ define amdgpu_kernel void @private_agent_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1247,10 +1591,14 @@ define amdgpu_kernel void @private_agent_release_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1259,65 +1607,93 @@ define amdgpu_kernel void @private_agent_release_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_agent_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_agent_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_agent_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_agent_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_agent_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_agent_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -1325,22 +1701,26 @@ define amdgpu_kernel void @private_agent_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -1367,10 +1747,14 @@ define amdgpu_kernel void @private_agent_seq_cst_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -1379,10 +1763,14 @@ define amdgpu_kernel void @private_agent_seq_cst_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -1391,10 +1779,14 @@ define amdgpu_kernel void @private_agent_seq_cst_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -1403,10 +1795,14 @@ define amdgpu_kernel void @private_agent_seq_cst_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -1419,10 +1815,14 @@ define amdgpu_kernel void @private_agent_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1431,10 +1831,14 @@ define amdgpu_kernel void @private_agent_seq_cst_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1443,65 +1847,93 @@ define amdgpu_kernel void @private_agent_seq_cst_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_agent_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_agent_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_agent_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_agent_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_agent_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_agent_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -1509,22 +1941,26 @@ define amdgpu_kernel void @private_agent_seq_cst_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -1552,8 +1988,15 @@ define amdgpu_kernel void @private_agent_monotonic_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1564,8 +2007,15 @@ define amdgpu_kernel void @private_agent_monotonic_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1576,8 +2026,15 @@ define amdgpu_kernel void @private_agent_monotonic_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1588,8 +2045,15 @@ define amdgpu_kernel void @private_agent_monotonic_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1604,8 +2068,15 @@ define amdgpu_kernel void @private_agent_monotonic_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -1616,8 +2087,15 @@ define amdgpu_kernel void @private_agent_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1628,8 +2106,15 @@ define amdgpu_kernel void @private_agent_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1638,8 +2123,14 @@ define amdgpu_kernel void @private_agent_monotonic_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_agent_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -1647,8 +2138,14 @@ define amdgpu_kernel void @private_agent_monotonic_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_agent_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -1656,8 +2153,14 @@ define amdgpu_kernel void @private_agent_monotonic_atomicrmw( ; GFX11-WGP-LABEL: private_agent_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -1665,8 +2168,14 @@ define amdgpu_kernel void @private_agent_monotonic_atomicrmw( ; GFX11-CU-LABEL: private_agent_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -1674,8 +2183,14 @@ define amdgpu_kernel void @private_agent_monotonic_atomicrmw( ; GFX12-WGP-LABEL: private_agent_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -1683,8 +2198,14 @@ define amdgpu_kernel void @private_agent_monotonic_atomicrmw( ; GFX12-CU-LABEL: private_agent_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -1692,29 +2213,34 @@ define amdgpu_kernel void @private_agent_monotonic_atomicrmw( ; GFX1250-LABEL: private_agent_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV @@ -1731,8 +2257,15 @@ define amdgpu_kernel void @private_agent_acquire_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1743,8 +2276,15 @@ define amdgpu_kernel void @private_agent_acquire_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1755,8 +2295,15 @@ define amdgpu_kernel void @private_agent_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1767,8 +2314,15 @@ define amdgpu_kernel void @private_agent_acquire_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1783,8 +2337,15 @@ define amdgpu_kernel void @private_agent_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -1795,8 +2356,15 @@ define amdgpu_kernel void @private_agent_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1807,8 +2375,15 @@ define amdgpu_kernel void @private_agent_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1817,8 +2392,14 @@ define amdgpu_kernel void @private_agent_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_agent_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -1826,8 +2407,14 @@ define amdgpu_kernel void @private_agent_acquire_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_agent_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -1835,8 +2422,14 @@ define amdgpu_kernel void @private_agent_acquire_atomicrmw( ; GFX11-WGP-LABEL: private_agent_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -1844,8 +2437,14 @@ define amdgpu_kernel void @private_agent_acquire_atomicrmw( ; GFX11-CU-LABEL: private_agent_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -1853,8 +2452,14 @@ define amdgpu_kernel void @private_agent_acquire_atomicrmw( ; GFX12-WGP-LABEL: private_agent_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -1862,8 +2467,14 @@ define amdgpu_kernel void @private_agent_acquire_atomicrmw( ; GFX12-CU-LABEL: private_agent_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -1871,29 +2482,34 @@ define amdgpu_kernel void @private_agent_acquire_atomicrmw( ; GFX1250-LABEL: private_agent_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV @@ -1913,8 +2529,15 @@ define amdgpu_kernel void @private_agent_release_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1925,8 +2548,15 @@ define amdgpu_kernel void @private_agent_release_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1937,8 +2567,15 @@ define amdgpu_kernel void @private_agent_release_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1949,8 +2586,15 @@ define amdgpu_kernel void @private_agent_release_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1965,8 +2609,15 @@ define amdgpu_kernel void @private_agent_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -1977,8 +2628,15 @@ define amdgpu_kernel void @private_agent_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1989,8 +2647,15 @@ define amdgpu_kernel void @private_agent_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1999,8 +2664,14 @@ define amdgpu_kernel void @private_agent_release_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_agent_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2008,8 +2679,14 @@ define amdgpu_kernel void @private_agent_release_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_agent_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -2017,8 +2694,14 @@ define amdgpu_kernel void @private_agent_release_atomicrmw( ; GFX11-WGP-LABEL: private_agent_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -2026,8 +2709,14 @@ define amdgpu_kernel void @private_agent_release_atomicrmw( ; GFX11-CU-LABEL: private_agent_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -2035,8 +2724,14 @@ define amdgpu_kernel void @private_agent_release_atomicrmw( ; GFX12-WGP-LABEL: private_agent_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -2044,8 +2739,14 @@ define amdgpu_kernel void @private_agent_release_atomicrmw( ; GFX12-CU-LABEL: private_agent_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -2053,29 +2754,34 @@ define amdgpu_kernel void @private_agent_release_atomicrmw( ; GFX1250-LABEL: private_agent_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -2097,8 +2803,15 @@ define amdgpu_kernel void @private_agent_acq_rel_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2109,8 +2822,15 @@ define amdgpu_kernel void @private_agent_acq_rel_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2121,8 +2841,15 @@ define amdgpu_kernel void @private_agent_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2133,8 +2860,15 @@ define amdgpu_kernel void @private_agent_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2149,8 +2883,15 @@ define amdgpu_kernel void @private_agent_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -2161,8 +2902,15 @@ define amdgpu_kernel void @private_agent_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2173,8 +2921,15 @@ define amdgpu_kernel void @private_agent_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2183,8 +2938,14 @@ define amdgpu_kernel void @private_agent_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_agent_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2192,8 +2953,14 @@ define amdgpu_kernel void @private_agent_acq_rel_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_agent_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -2201,8 +2968,14 @@ define amdgpu_kernel void @private_agent_acq_rel_atomicrmw( ; GFX11-WGP-LABEL: private_agent_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -2210,8 +2983,14 @@ define amdgpu_kernel void @private_agent_acq_rel_atomicrmw( ; GFX11-CU-LABEL: private_agent_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -2219,8 +2998,14 @@ define amdgpu_kernel void @private_agent_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: private_agent_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -2228,8 +3013,14 @@ define amdgpu_kernel void @private_agent_acq_rel_atomicrmw( ; GFX12-CU-LABEL: private_agent_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -2237,29 +3028,34 @@ define amdgpu_kernel void @private_agent_acq_rel_atomicrmw( ; GFX1250-LABEL: private_agent_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -2284,8 +3080,15 @@ define amdgpu_kernel void @private_agent_seq_cst_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2296,8 +3099,15 @@ define amdgpu_kernel void @private_agent_seq_cst_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2308,8 +3118,15 @@ define amdgpu_kernel void @private_agent_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2320,8 +3137,15 @@ define amdgpu_kernel void @private_agent_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2336,8 +3160,15 @@ define amdgpu_kernel void @private_agent_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -2348,8 +3179,15 @@ define amdgpu_kernel void @private_agent_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2360,8 +3198,15 @@ define amdgpu_kernel void @private_agent_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2370,8 +3215,14 @@ define amdgpu_kernel void @private_agent_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_agent_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2379,8 +3230,14 @@ define amdgpu_kernel void @private_agent_seq_cst_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_agent_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -2388,8 +3245,14 @@ define amdgpu_kernel void @private_agent_seq_cst_atomicrmw( ; GFX11-WGP-LABEL: private_agent_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -2397,8 +3260,14 @@ define amdgpu_kernel void @private_agent_seq_cst_atomicrmw( ; GFX11-CU-LABEL: private_agent_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -2406,8 +3275,14 @@ define amdgpu_kernel void @private_agent_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: private_agent_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -2415,8 +3290,14 @@ define amdgpu_kernel void @private_agent_seq_cst_atomicrmw( ; GFX12-CU-LABEL: private_agent_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -2424,29 +3305,34 @@ define amdgpu_kernel void @private_agent_seq_cst_atomicrmw( ; GFX1250-LABEL: private_agent_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -2471,6 +3357,10 @@ define amdgpu_kernel void @private_agent_acquire_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2489,6 +3379,10 @@ define amdgpu_kernel void @private_agent_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2506,6 +3400,10 @@ define amdgpu_kernel void @private_agent_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2523,6 +3421,10 @@ define amdgpu_kernel void @private_agent_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2544,6 +3446,10 @@ define amdgpu_kernel void @private_agent_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2561,6 +3467,10 @@ define amdgpu_kernel void @private_agent_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2578,6 +3488,10 @@ define amdgpu_kernel void @private_agent_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2593,6 +3507,10 @@ define amdgpu_kernel void @private_agent_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_agent_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -2605,6 +3523,10 @@ define amdgpu_kernel void @private_agent_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_agent_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -2617,6 +3539,10 @@ define amdgpu_kernel void @private_agent_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: private_agent_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -2629,6 +3555,10 @@ define amdgpu_kernel void @private_agent_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: private_agent_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -2641,6 +3571,10 @@ define amdgpu_kernel void @private_agent_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: private_agent_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -2653,6 +3587,10 @@ define amdgpu_kernel void @private_agent_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: private_agent_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -2666,22 +3604,26 @@ define amdgpu_kernel void @private_agent_acquire_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -2709,6 +3651,10 @@ define amdgpu_kernel void @private_agent_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2727,6 +3673,10 @@ define amdgpu_kernel void @private_agent_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2744,6 +3694,10 @@ define amdgpu_kernel void @private_agent_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2761,6 +3715,10 @@ define amdgpu_kernel void @private_agent_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2782,6 +3740,10 @@ define amdgpu_kernel void @private_agent_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2799,6 +3761,10 @@ define amdgpu_kernel void @private_agent_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2816,6 +3782,10 @@ define amdgpu_kernel void @private_agent_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2831,6 +3801,10 @@ define amdgpu_kernel void @private_agent_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_agent_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -2843,6 +3817,10 @@ define amdgpu_kernel void @private_agent_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_agent_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -2855,6 +3833,10 @@ define amdgpu_kernel void @private_agent_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: private_agent_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -2867,6 +3849,10 @@ define amdgpu_kernel void @private_agent_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: private_agent_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -2879,6 +3865,10 @@ define amdgpu_kernel void @private_agent_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: private_agent_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -2891,6 +3881,10 @@ define amdgpu_kernel void @private_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: private_agent_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -2904,22 +3898,26 @@ define amdgpu_kernel void @private_agent_acq_rel_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -2952,6 +3950,10 @@ define amdgpu_kernel void @private_agent_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2970,6 +3972,10 @@ define amdgpu_kernel void @private_agent_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2987,6 +3993,10 @@ define amdgpu_kernel void @private_agent_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -3004,6 +4014,10 @@ define amdgpu_kernel void @private_agent_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -3025,6 +4039,10 @@ define amdgpu_kernel void @private_agent_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -3042,6 +4060,10 @@ define amdgpu_kernel void @private_agent_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -3059,6 +4081,10 @@ define amdgpu_kernel void @private_agent_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -3074,6 +4100,10 @@ define amdgpu_kernel void @private_agent_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_agent_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -3086,6 +4116,10 @@ define amdgpu_kernel void @private_agent_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_agent_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -3098,6 +4132,10 @@ define amdgpu_kernel void @private_agent_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: private_agent_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -3110,6 +4148,10 @@ define amdgpu_kernel void @private_agent_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: private_agent_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -3122,6 +4164,10 @@ define amdgpu_kernel void @private_agent_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: private_agent_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -3134,6 +4180,10 @@ define amdgpu_kernel void @private_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: private_agent_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -3147,22 +4197,26 @@ define amdgpu_kernel void @private_agent_seq_cst_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -3195,6 +4249,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -3215,6 +4275,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -3235,6 +4301,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3252,6 +4324,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3273,6 +4351,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -3293,6 +4377,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3311,6 +4401,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3327,6 +4423,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3344,6 +4446,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3361,6 +4469,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_agent_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -3376,6 +4490,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_agent_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -3391,6 +4511,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_agent_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3405,6 +4531,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_agent_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3419,26 +4551,32 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: private_agent_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -3446,6 +4584,7 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -3465,6 +4604,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -3485,6 +4630,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -3505,6 +4656,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3522,6 +4679,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3543,6 +4706,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -3563,6 +4732,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3581,6 +4756,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3597,6 +4778,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3614,6 +4801,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3631,6 +4824,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_agent_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -3646,6 +4845,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_agent_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -3661,6 +4866,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_agent_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3675,6 +4886,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_agent_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3689,26 +4906,32 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: private_agent_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -3716,6 +4939,7 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -3738,6 +4962,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -3758,6 +4988,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -3778,6 +5014,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3795,6 +5037,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3816,6 +5064,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -3836,6 +5090,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3854,6 +5114,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3870,6 +5136,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3887,6 +5159,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3904,6 +5182,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_agent_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -3919,6 +5203,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_agent_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -3934,6 +5224,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_agent_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3948,6 +5244,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_agent_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3962,26 +5264,32 @@ define amdgpu_kernel void @private_agent_release_monotonic_cmpxchg( ; GFX1250-LABEL: private_agent_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -3989,6 +5297,7 @@ define amdgpu_kernel void @private_agent_release_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -4013,6 +5322,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -4033,6 +5348,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -4053,6 +5374,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4070,6 +5397,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4091,6 +5424,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -4111,6 +5450,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4129,6 +5474,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4145,6 +5496,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4162,6 +5519,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4179,6 +5542,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_agent_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -4194,6 +5563,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_agent_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -4209,6 +5584,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_agent_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4223,6 +5604,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_agent_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4237,26 +5624,32 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: private_agent_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -4264,6 +5657,7 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -4291,6 +5685,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -4311,6 +5711,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -4331,6 +5737,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4348,6 +5760,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4369,6 +5787,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -4389,6 +5813,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4407,6 +5837,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4423,6 +5859,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4440,6 +5882,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4457,6 +5905,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_agent_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -4472,6 +5926,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_agent_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -4487,6 +5947,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_agent_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4501,6 +5967,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_agent_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4515,26 +5987,32 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: private_agent_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -4542,6 +6020,7 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -4569,6 +6048,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -4589,6 +6074,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -4609,6 +6100,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4626,6 +6123,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4647,6 +6150,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -4667,6 +6176,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4685,6 +6200,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4701,6 +6222,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4718,6 +6245,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4735,6 +6268,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_agent_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -4750,6 +6289,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_cmpxchg( ; GFX11-CU-LABEL: private_agent_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -4765,6 +6310,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_agent_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4779,6 +6330,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: private_agent_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4793,26 +6350,32 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: private_agent_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -4820,6 +6383,7 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -4842,6 +6406,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -4862,6 +6432,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -4882,6 +6458,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4899,6 +6481,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4920,6 +6508,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -4940,6 +6534,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4958,6 +6558,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4974,6 +6580,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4991,6 +6603,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5008,6 +6626,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_agent_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -5023,6 +6647,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_cmpxchg( ; GFX11-CU-LABEL: private_agent_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -5038,6 +6668,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_agent_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5052,6 +6688,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: private_agent_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5066,26 +6708,32 @@ define amdgpu_kernel void @private_agent_acquire_acquire_cmpxchg( ; GFX1250-LABEL: private_agent_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -5093,6 +6741,7 @@ define amdgpu_kernel void @private_agent_acquire_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -5115,6 +6764,12 @@ define amdgpu_kernel void @private_agent_release_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -5135,6 +6790,12 @@ define amdgpu_kernel void @private_agent_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -5155,6 +6816,12 @@ define amdgpu_kernel void @private_agent_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5172,6 +6839,12 @@ define amdgpu_kernel void @private_agent_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5193,6 +6866,12 @@ define amdgpu_kernel void @private_agent_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -5213,6 +6892,12 @@ define amdgpu_kernel void @private_agent_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5231,6 +6916,12 @@ define amdgpu_kernel void @private_agent_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5247,6 +6938,12 @@ define amdgpu_kernel void @private_agent_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5264,6 +6961,12 @@ define amdgpu_kernel void @private_agent_release_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5281,6 +6984,12 @@ define amdgpu_kernel void @private_agent_release_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_agent_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -5296,6 +7005,12 @@ define amdgpu_kernel void @private_agent_release_acquire_cmpxchg( ; GFX11-CU-LABEL: private_agent_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -5311,6 +7026,12 @@ define amdgpu_kernel void @private_agent_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_agent_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5325,6 +7046,12 @@ define amdgpu_kernel void @private_agent_release_acquire_cmpxchg( ; GFX12-CU-LABEL: private_agent_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5339,26 +7066,32 @@ define amdgpu_kernel void @private_agent_release_acquire_cmpxchg( ; GFX1250-LABEL: private_agent_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -5366,6 +7099,7 @@ define amdgpu_kernel void @private_agent_release_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -5393,6 +7127,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -5413,6 +7153,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -5433,6 +7179,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5450,6 +7202,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5471,6 +7229,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -5491,6 +7255,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5509,6 +7279,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5525,6 +7301,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5542,6 +7324,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5559,6 +7347,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_agent_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -5574,6 +7368,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_cmpxchg( ; GFX11-CU-LABEL: private_agent_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -5589,6 +7389,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_agent_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5603,6 +7409,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: private_agent_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5617,26 +7429,32 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: private_agent_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -5644,6 +7462,7 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -5671,6 +7490,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -5691,6 +7516,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -5711,6 +7542,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5728,6 +7565,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5749,6 +7592,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -5769,6 +7618,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5787,6 +7642,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5803,6 +7664,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5820,6 +7687,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5837,6 +7710,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_agent_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -5852,6 +7731,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_cmpxchg( ; GFX11-CU-LABEL: private_agent_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -5867,6 +7752,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_agent_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5881,6 +7772,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: private_agent_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5895,26 +7792,32 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: private_agent_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -5922,6 +7825,7 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -5949,6 +7853,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -5969,6 +7879,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -5989,6 +7905,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6006,6 +7928,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6027,6 +7955,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -6047,6 +7981,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6065,6 +8005,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6081,6 +8027,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6098,6 +8050,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6115,6 +8073,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_agent_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -6130,6 +8094,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_agent_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -6145,6 +8115,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_agent_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6159,6 +8135,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_agent_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6173,26 +8155,32 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: private_agent_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -6200,6 +8188,7 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -6227,6 +8216,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -6247,6 +8242,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -6267,6 +8268,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6284,6 +8291,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6305,6 +8318,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -6325,6 +8344,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6343,6 +8368,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6359,6 +8390,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6376,6 +8413,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6393,6 +8436,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_agent_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -6408,6 +8457,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_agent_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -6423,6 +8478,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_agent_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6437,6 +8498,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_agent_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6451,26 +8518,32 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: private_agent_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -6478,6 +8551,7 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -6505,6 +8579,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -6525,6 +8605,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -6545,6 +8631,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6562,6 +8654,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6583,6 +8681,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -6603,6 +8707,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6621,6 +8731,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6637,6 +8753,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6654,6 +8776,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6671,6 +8799,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_agent_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -6686,6 +8820,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_agent_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -6701,6 +8841,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_agent_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6715,6 +8861,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_agent_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6729,26 +8881,32 @@ define amdgpu_kernel void @private_agent_release_seq_cst_cmpxchg( ; GFX1250-LABEL: private_agent_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -6756,6 +8914,7 @@ define amdgpu_kernel void @private_agent_release_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -6783,6 +8942,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -6803,6 +8968,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -6823,6 +8994,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6840,6 +9017,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6861,6 +9044,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -6881,6 +9070,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6899,6 +9094,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6915,6 +9116,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6932,6 +9139,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6949,6 +9162,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_agent_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -6964,6 +9183,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_agent_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -6979,6 +9204,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6993,6 +9224,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7007,26 +9244,32 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: private_agent_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -7034,6 +9277,7 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -7061,6 +9305,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -7081,6 +9331,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -7101,6 +9357,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7118,6 +9380,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7139,6 +9407,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -7159,6 +9433,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7177,6 +9457,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7193,6 +9479,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -7210,6 +9502,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -7227,6 +9525,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_agent_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -7242,6 +9546,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_agent_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -7257,6 +9567,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7271,6 +9587,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7285,26 +9607,32 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: private_agent_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -7312,6 +9640,7 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -7340,6 +9669,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -7364,6 +9699,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -7386,6 +9727,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7405,6 +9752,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7429,6 +9782,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -7451,6 +9810,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7471,6 +9836,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7490,6 +9861,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7510,6 +9887,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7529,6 +9912,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -7545,6 +9934,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -7561,6 +9956,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7576,6 +9977,12 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7592,25 +9999,31 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -7642,6 +10055,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -7666,6 +10085,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -7688,6 +10113,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7707,6 +10138,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7731,6 +10168,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -7753,6 +10196,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7773,6 +10222,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7792,6 +10247,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7812,6 +10273,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7831,6 +10298,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -7847,6 +10320,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -7863,6 +10342,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7878,6 +10363,12 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7894,25 +10385,31 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -7946,6 +10443,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -7970,6 +10473,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -7992,6 +10501,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8011,6 +10526,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8035,6 +10556,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -8057,6 +10584,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8077,6 +10610,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8096,6 +10635,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8116,6 +10661,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8135,6 +10686,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -8151,6 +10708,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -8167,6 +10730,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8182,6 +10751,12 @@ define amdgpu_kernel void @private_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8198,25 +10773,31 @@ define amdgpu_kernel void @private_agent_release_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -8253,6 +10834,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -8277,6 +10864,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -8299,6 +10892,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8318,6 +10917,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8342,6 +10947,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -8364,6 +10975,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8384,6 +11001,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8403,6 +11026,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8423,6 +11052,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8442,6 +11077,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -8458,6 +11099,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -8474,6 +11121,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8489,6 +11142,12 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8505,25 +11164,31 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -8562,6 +11227,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -8586,6 +11257,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -8608,6 +11285,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8627,6 +11310,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8651,6 +11340,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -8673,6 +11368,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8693,6 +11394,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8712,6 +11419,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8732,6 +11445,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8751,6 +11470,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -8767,6 +11492,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -8783,6 +11514,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8798,6 +11535,12 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8814,25 +11557,31 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -8871,6 +11620,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -8895,6 +11650,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -8917,6 +11678,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8936,6 +11703,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8960,6 +11733,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -8982,6 +11761,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9002,6 +11787,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9021,6 +11812,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9041,6 +11838,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9060,6 +11863,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -9076,6 +11885,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -9092,6 +11907,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9107,6 +11928,12 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9123,25 +11950,31 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -9175,6 +12008,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -9199,6 +12038,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -9221,6 +12066,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9240,6 +12091,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9264,6 +12121,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -9286,6 +12149,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9306,6 +12175,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9325,6 +12200,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9345,6 +12226,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9364,6 +12251,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -9380,6 +12273,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -9396,6 +12295,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9411,6 +12316,12 @@ define amdgpu_kernel void @private_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9427,25 +12338,31 @@ define amdgpu_kernel void @private_agent_acquire_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -9479,6 +12396,12 @@ define amdgpu_kernel void @private_agent_release_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -9503,6 +12426,12 @@ define amdgpu_kernel void @private_agent_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -9525,6 +12454,12 @@ define amdgpu_kernel void @private_agent_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9544,6 +12479,12 @@ define amdgpu_kernel void @private_agent_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9568,6 +12509,12 @@ define amdgpu_kernel void @private_agent_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -9590,6 +12537,12 @@ define amdgpu_kernel void @private_agent_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9610,6 +12563,12 @@ define amdgpu_kernel void @private_agent_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9629,6 +12588,12 @@ define amdgpu_kernel void @private_agent_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9649,6 +12614,12 @@ define amdgpu_kernel void @private_agent_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9668,6 +12639,12 @@ define amdgpu_kernel void @private_agent_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -9684,6 +12661,12 @@ define amdgpu_kernel void @private_agent_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -9700,6 +12683,12 @@ define amdgpu_kernel void @private_agent_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9715,6 +12704,12 @@ define amdgpu_kernel void @private_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9731,25 +12726,31 @@ define amdgpu_kernel void @private_agent_release_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -9788,6 +12789,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -9812,6 +12819,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -9834,6 +12847,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9853,6 +12872,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9877,6 +12902,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -9899,6 +12930,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9919,6 +12956,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9938,6 +12981,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9958,6 +13007,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9977,6 +13032,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -9993,6 +13054,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -10009,6 +13076,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10024,6 +13097,12 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10040,25 +13119,31 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -10097,6 +13182,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -10121,6 +13212,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -10143,6 +13240,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10162,6 +13265,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10186,6 +13295,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -10208,6 +13323,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10228,6 +13349,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10247,6 +13374,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10267,6 +13400,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10286,6 +13425,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -10302,6 +13447,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -10318,6 +13469,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10333,6 +13490,12 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10349,25 +13512,31 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -10406,6 +13575,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -10430,6 +13605,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -10452,6 +13633,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10471,6 +13658,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10495,6 +13688,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -10517,6 +13716,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10537,6 +13742,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10556,6 +13767,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10576,6 +13793,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10595,6 +13818,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -10611,6 +13840,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -10627,6 +13862,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10642,6 +13883,12 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10658,25 +13905,31 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -10715,6 +13968,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -10739,6 +13998,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -10761,6 +14026,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10780,6 +14051,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10804,6 +14081,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -10826,6 +14109,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10846,6 +14135,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10865,6 +14160,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10885,6 +14186,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10904,6 +14211,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -10920,6 +14233,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -10936,6 +14255,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10951,6 +14276,12 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10967,25 +14298,31 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -11024,6 +14361,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -11048,6 +14391,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -11070,6 +14419,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11089,6 +14444,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11113,6 +14474,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -11135,6 +14502,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11155,6 +14528,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11174,6 +14553,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11194,6 +14579,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11213,6 +14604,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -11229,6 +14626,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -11245,6 +14648,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11260,6 +14669,12 @@ define amdgpu_kernel void @private_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11276,25 +14691,31 @@ define amdgpu_kernel void @private_agent_release_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -11333,6 +14754,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -11357,6 +14784,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -11379,6 +14812,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11398,6 +14837,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11422,6 +14867,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -11444,6 +14895,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11464,6 +14921,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11483,6 +14946,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11503,6 +14972,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11522,6 +14997,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -11538,6 +15019,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -11554,6 +15041,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11569,6 +15062,12 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11585,25 +15084,31 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -11642,6 +15147,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -11666,6 +15177,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -11688,6 +15205,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11707,6 +15230,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11731,6 +15260,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -11753,6 +15288,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11773,6 +15314,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11792,6 +15339,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11812,6 +15365,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11831,6 +15390,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -11847,6 +15412,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -11863,6 +15434,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11878,6 +15455,12 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11894,25 +15477,31 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -11949,11 +15538,15 @@ define amdgpu_kernel void @private_agent_one_as_unordered_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11963,11 +15556,15 @@ define amdgpu_kernel void @private_agent_one_as_unordered_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11977,11 +15574,15 @@ define amdgpu_kernel void @private_agent_one_as_unordered_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11991,11 +15592,15 @@ define amdgpu_kernel void @private_agent_one_as_unordered_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12009,11 +15614,15 @@ define amdgpu_kernel void @private_agent_one_as_unordered_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -12023,11 +15632,15 @@ define amdgpu_kernel void @private_agent_one_as_unordered_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12037,11 +15650,15 @@ define amdgpu_kernel void @private_agent_one_as_unordered_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12049,84 +15666,108 @@ define amdgpu_kernel void @private_agent_one_as_unordered_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_agent_one_as_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_agent_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_agent_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_agent_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_agent_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_agent_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12135,6 +15776,7 @@ define amdgpu_kernel void @private_agent_one_as_unordered_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -12149,11 +15791,15 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12163,11 +15809,15 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12177,11 +15827,15 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12191,11 +15845,15 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12209,11 +15867,15 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -12223,11 +15885,15 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12237,11 +15903,15 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12249,84 +15919,108 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_agent_one_as_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_agent_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_agent_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_agent_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_agent_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_agent_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12335,6 +16029,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -12349,11 +16044,15 @@ define amdgpu_kernel void @private_agent_one_as_acquire_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12363,11 +16062,15 @@ define amdgpu_kernel void @private_agent_one_as_acquire_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12377,11 +16080,15 @@ define amdgpu_kernel void @private_agent_one_as_acquire_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12391,11 +16098,15 @@ define amdgpu_kernel void @private_agent_one_as_acquire_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12409,11 +16120,15 @@ define amdgpu_kernel void @private_agent_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -12423,11 +16138,15 @@ define amdgpu_kernel void @private_agent_one_as_acquire_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12437,11 +16156,15 @@ define amdgpu_kernel void @private_agent_one_as_acquire_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12449,84 +16172,108 @@ define amdgpu_kernel void @private_agent_one_as_acquire_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_agent_one_as_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_agent_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_agent_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_agent_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_agent_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_agent_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12537,6 +16284,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_load( ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -12551,11 +16299,15 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12565,11 +16317,15 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12579,11 +16335,15 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12593,11 +16353,15 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12611,11 +16375,15 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -12625,11 +16393,15 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12639,11 +16411,15 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12651,84 +16427,108 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_agent_one_as_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_agent_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_agent_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_agent_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_agent_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_agent_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12741,6 +16541,7 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_load( ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -12755,10 +16556,14 @@ define amdgpu_kernel void @private_agent_one_as_unordered_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -12767,10 +16572,14 @@ define amdgpu_kernel void @private_agent_one_as_unordered_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -12779,10 +16588,14 @@ define amdgpu_kernel void @private_agent_one_as_unordered_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -12791,10 +16604,14 @@ define amdgpu_kernel void @private_agent_one_as_unordered_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -12807,10 +16624,14 @@ define amdgpu_kernel void @private_agent_one_as_unordered_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -12819,10 +16640,14 @@ define amdgpu_kernel void @private_agent_one_as_unordered_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -12831,65 +16656,93 @@ define amdgpu_kernel void @private_agent_one_as_unordered_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_agent_one_as_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_agent_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_agent_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_agent_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_agent_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -12897,22 +16750,26 @@ define amdgpu_kernel void @private_agent_one_as_unordered_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12934,10 +16791,14 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -12946,10 +16807,14 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -12958,10 +16823,14 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -12970,10 +16839,14 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -12986,10 +16859,14 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -12998,10 +16875,14 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -13010,65 +16891,93 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_agent_one_as_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_agent_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_agent_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_agent_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_agent_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -13076,22 +16985,26 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -13113,10 +17026,14 @@ define amdgpu_kernel void @private_agent_one_as_release_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -13125,10 +17042,14 @@ define amdgpu_kernel void @private_agent_one_as_release_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -13137,10 +17058,14 @@ define amdgpu_kernel void @private_agent_one_as_release_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -13149,10 +17074,14 @@ define amdgpu_kernel void @private_agent_one_as_release_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -13165,10 +17094,14 @@ define amdgpu_kernel void @private_agent_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -13177,10 +17110,14 @@ define amdgpu_kernel void @private_agent_one_as_release_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -13189,65 +17126,93 @@ define amdgpu_kernel void @private_agent_one_as_release_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_agent_one_as_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_agent_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_agent_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_agent_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_agent_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -13255,22 +17220,26 @@ define amdgpu_kernel void @private_agent_one_as_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -13297,10 +17266,14 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -13309,10 +17282,14 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -13321,10 +17298,14 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -13333,10 +17314,14 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -13349,10 +17334,14 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -13361,10 +17350,14 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -13373,65 +17366,93 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_agent_one_as_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_agent_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_agent_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_agent_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_agent_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -13439,22 +17460,26 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -13482,8 +17507,15 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13494,8 +17526,15 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13506,8 +17545,15 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13518,8 +17564,15 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13534,8 +17587,15 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -13546,8 +17606,15 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13558,8 +17625,15 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13568,8 +17642,14 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -13577,8 +17657,14 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_agent_one_as_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -13586,8 +17672,14 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_atomicrmw( ; GFX11-WGP-LABEL: private_agent_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -13595,8 +17687,14 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_atomicrmw( ; GFX11-CU-LABEL: private_agent_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -13604,8 +17702,14 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_atomicrmw( ; GFX12-WGP-LABEL: private_agent_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -13613,8 +17717,14 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_atomicrmw( ; GFX12-CU-LABEL: private_agent_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -13622,29 +17732,34 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_atomicrmw( ; GFX1250-LABEL: private_agent_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV @@ -13661,8 +17776,15 @@ define amdgpu_kernel void @private_agent_one_as_acquire_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13673,8 +17795,15 @@ define amdgpu_kernel void @private_agent_one_as_acquire_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13685,8 +17814,15 @@ define amdgpu_kernel void @private_agent_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13697,8 +17833,15 @@ define amdgpu_kernel void @private_agent_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13713,8 +17856,15 @@ define amdgpu_kernel void @private_agent_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -13725,8 +17875,15 @@ define amdgpu_kernel void @private_agent_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13737,8 +17894,15 @@ define amdgpu_kernel void @private_agent_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13747,8 +17911,14 @@ define amdgpu_kernel void @private_agent_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -13756,8 +17926,14 @@ define amdgpu_kernel void @private_agent_one_as_acquire_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_agent_one_as_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -13765,8 +17941,14 @@ define amdgpu_kernel void @private_agent_one_as_acquire_atomicrmw( ; GFX11-WGP-LABEL: private_agent_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -13774,8 +17956,14 @@ define amdgpu_kernel void @private_agent_one_as_acquire_atomicrmw( ; GFX11-CU-LABEL: private_agent_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -13783,8 +17971,14 @@ define amdgpu_kernel void @private_agent_one_as_acquire_atomicrmw( ; GFX12-WGP-LABEL: private_agent_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -13792,8 +17986,14 @@ define amdgpu_kernel void @private_agent_one_as_acquire_atomicrmw( ; GFX12-CU-LABEL: private_agent_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -13801,29 +18001,34 @@ define amdgpu_kernel void @private_agent_one_as_acquire_atomicrmw( ; GFX1250-LABEL: private_agent_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV @@ -13843,8 +18048,15 @@ define amdgpu_kernel void @private_agent_one_as_release_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13855,8 +18067,15 @@ define amdgpu_kernel void @private_agent_one_as_release_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13867,8 +18086,15 @@ define amdgpu_kernel void @private_agent_one_as_release_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13879,8 +18105,15 @@ define amdgpu_kernel void @private_agent_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13895,8 +18128,15 @@ define amdgpu_kernel void @private_agent_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -13907,8 +18147,15 @@ define amdgpu_kernel void @private_agent_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13919,8 +18166,15 @@ define amdgpu_kernel void @private_agent_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13929,8 +18183,14 @@ define amdgpu_kernel void @private_agent_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -13938,8 +18198,14 @@ define amdgpu_kernel void @private_agent_one_as_release_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_agent_one_as_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -13947,8 +18213,14 @@ define amdgpu_kernel void @private_agent_one_as_release_atomicrmw( ; GFX11-WGP-LABEL: private_agent_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -13956,8 +18228,14 @@ define amdgpu_kernel void @private_agent_one_as_release_atomicrmw( ; GFX11-CU-LABEL: private_agent_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -13965,8 +18243,14 @@ define amdgpu_kernel void @private_agent_one_as_release_atomicrmw( ; GFX12-WGP-LABEL: private_agent_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -13974,8 +18258,14 @@ define amdgpu_kernel void @private_agent_one_as_release_atomicrmw( ; GFX12-CU-LABEL: private_agent_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -13983,29 +18273,34 @@ define amdgpu_kernel void @private_agent_one_as_release_atomicrmw( ; GFX1250-LABEL: private_agent_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -14027,8 +18322,15 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14039,8 +18341,15 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14051,8 +18360,15 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14063,8 +18379,15 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14079,8 +18402,15 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -14091,8 +18421,15 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14103,8 +18440,15 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14113,8 +18457,14 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -14122,8 +18472,14 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_agent_one_as_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -14131,8 +18487,14 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_atomicrmw( ; GFX11-WGP-LABEL: private_agent_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -14140,8 +18502,14 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_atomicrmw( ; GFX11-CU-LABEL: private_agent_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -14149,8 +18517,14 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: private_agent_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -14158,8 +18532,14 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_atomicrmw( ; GFX12-CU-LABEL: private_agent_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -14167,29 +18547,34 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_atomicrmw( ; GFX1250-LABEL: private_agent_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -14214,8 +18599,15 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14226,8 +18618,15 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14238,8 +18637,15 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14250,8 +18656,15 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14266,8 +18679,15 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -14278,8 +18698,15 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14290,8 +18717,15 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14300,8 +18734,14 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -14309,8 +18749,14 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_agent_one_as_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -14318,8 +18764,14 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_atomicrmw( ; GFX11-WGP-LABEL: private_agent_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -14327,8 +18779,14 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_atomicrmw( ; GFX11-CU-LABEL: private_agent_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -14336,8 +18794,14 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: private_agent_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -14345,8 +18809,14 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_atomicrmw( ; GFX12-CU-LABEL: private_agent_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -14354,29 +18824,34 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_atomicrmw( ; GFX1250-LABEL: private_agent_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -14401,6 +18876,10 @@ define amdgpu_kernel void @private_agent_one_as_acquire_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14419,6 +18898,10 @@ define amdgpu_kernel void @private_agent_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14436,6 +18919,10 @@ define amdgpu_kernel void @private_agent_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -14453,6 +18940,10 @@ define amdgpu_kernel void @private_agent_one_as_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -14474,6 +18965,10 @@ define amdgpu_kernel void @private_agent_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14491,6 +18986,10 @@ define amdgpu_kernel void @private_agent_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14508,6 +19007,10 @@ define amdgpu_kernel void @private_agent_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14523,6 +19026,10 @@ define amdgpu_kernel void @private_agent_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14535,6 +19042,10 @@ define amdgpu_kernel void @private_agent_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_agent_one_as_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14547,6 +19058,10 @@ define amdgpu_kernel void @private_agent_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: private_agent_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14559,6 +19074,10 @@ define amdgpu_kernel void @private_agent_one_as_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: private_agent_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14571,6 +19090,10 @@ define amdgpu_kernel void @private_agent_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: private_agent_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14583,6 +19106,10 @@ define amdgpu_kernel void @private_agent_one_as_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: private_agent_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14596,22 +19123,26 @@ define amdgpu_kernel void @private_agent_one_as_acquire_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -14639,6 +19170,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14657,6 +19192,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14674,6 +19213,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -14691,6 +19234,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -14712,6 +19259,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14729,6 +19280,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14746,6 +19301,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14761,6 +19320,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14773,6 +19336,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_agent_one_as_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14785,6 +19352,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: private_agent_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14797,6 +19368,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: private_agent_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14809,6 +19384,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: private_agent_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14821,6 +19400,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: private_agent_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14834,22 +19417,26 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -14882,6 +19469,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14900,6 +19491,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14917,6 +19512,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -14934,6 +19533,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -14955,6 +19558,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14972,6 +19579,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14989,6 +19600,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -15004,6 +19619,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -15016,6 +19635,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_agent_one_as_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -15028,6 +19651,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: private_agent_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -15040,6 +19667,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: private_agent_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -15052,6 +19683,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: private_agent_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -15064,6 +19699,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: private_agent_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -15077,22 +19716,26 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -15125,6 +19768,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -15145,6 +19794,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -15165,6 +19820,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15182,6 +19843,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15203,6 +19870,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -15223,6 +19896,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15241,6 +19920,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15257,6 +19942,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15274,6 +19965,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15291,6 +19988,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -15306,6 +20009,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -15321,6 +20030,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15335,6 +20050,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15349,26 +20070,32 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -15376,6 +20103,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -15395,6 +20123,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -15415,6 +20149,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -15435,6 +20175,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15452,6 +20198,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15473,6 +20225,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -15493,6 +20251,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15511,6 +20275,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15527,6 +20297,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15544,6 +20320,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15561,6 +20343,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -15576,6 +20364,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -15591,6 +20385,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15605,6 +20405,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15619,26 +20425,32 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -15646,6 +20458,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -15668,6 +20481,12 @@ define amdgpu_kernel void @private_agent_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -15688,6 +20507,12 @@ define amdgpu_kernel void @private_agent_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -15708,6 +20533,12 @@ define amdgpu_kernel void @private_agent_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15725,6 +20556,12 @@ define amdgpu_kernel void @private_agent_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15746,6 +20583,12 @@ define amdgpu_kernel void @private_agent_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -15766,6 +20609,12 @@ define amdgpu_kernel void @private_agent_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15784,6 +20633,12 @@ define amdgpu_kernel void @private_agent_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15800,6 +20655,12 @@ define amdgpu_kernel void @private_agent_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15817,6 +20678,12 @@ define amdgpu_kernel void @private_agent_one_as_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_one_as_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15834,6 +20701,12 @@ define amdgpu_kernel void @private_agent_one_as_release_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -15849,6 +20722,12 @@ define amdgpu_kernel void @private_agent_one_as_release_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -15864,6 +20743,12 @@ define amdgpu_kernel void @private_agent_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15878,6 +20763,12 @@ define amdgpu_kernel void @private_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15892,26 +20783,32 @@ define amdgpu_kernel void @private_agent_one_as_release_monotonic_cmpxchg( ; GFX1250-LABEL: private_agent_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -15919,6 +20816,7 @@ define amdgpu_kernel void @private_agent_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -15943,6 +20841,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -15963,6 +20867,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -15983,6 +20893,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16000,6 +20916,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16021,6 +20943,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -16041,6 +20969,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16059,6 +20993,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16075,6 +21015,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16092,6 +21038,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16109,6 +21061,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -16124,6 +21082,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -16139,6 +21103,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16153,6 +21123,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16167,26 +21143,32 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -16194,6 +21176,7 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -16221,6 +21204,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -16241,6 +21230,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -16261,6 +21256,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16278,6 +21279,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16299,6 +21306,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -16319,6 +21332,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16337,6 +21356,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16353,6 +21378,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16370,6 +21401,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16387,6 +21424,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -16402,6 +21445,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -16417,6 +21466,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16431,6 +21486,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16445,26 +21506,32 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -16472,6 +21539,7 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -16499,6 +21567,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -16519,6 +21593,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -16539,6 +21619,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16556,6 +21642,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16577,6 +21669,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -16597,6 +21695,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16615,6 +21719,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16631,6 +21741,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16648,6 +21764,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16665,6 +21787,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -16680,6 +21808,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -16695,6 +21829,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16709,6 +21849,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16723,26 +21869,32 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -16750,6 +21902,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -16772,6 +21925,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -16792,6 +21951,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -16812,6 +21977,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16829,6 +22000,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16850,6 +22027,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -16870,6 +22053,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16888,6 +22077,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16904,6 +22099,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16921,6 +22122,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_one_as_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16938,6 +22145,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -16953,6 +22166,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -16968,6 +22187,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16982,6 +22207,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16996,26 +22227,32 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_cmpxchg( ; GFX1250-LABEL: private_agent_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -17023,6 +22260,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -17045,6 +22283,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -17065,6 +22309,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -17085,6 +22335,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17102,6 +22358,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17123,6 +22385,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -17143,6 +22411,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17161,6 +22435,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17177,6 +22457,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17194,6 +22480,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_one_as_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17211,6 +22503,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -17226,6 +22524,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -17241,6 +22545,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17255,6 +22565,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17269,26 +22585,32 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_cmpxchg( ; GFX1250-LABEL: private_agent_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -17296,6 +22618,7 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -17323,6 +22646,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -17343,6 +22672,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -17363,6 +22698,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17380,6 +22721,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17401,6 +22748,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -17421,6 +22774,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17439,6 +22798,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17455,6 +22820,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17472,6 +22843,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17489,6 +22866,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -17504,6 +22887,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -17519,6 +22908,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17533,6 +22928,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17547,26 +22948,32 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -17574,6 +22981,7 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -17601,6 +23009,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -17621,6 +23035,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -17641,6 +23061,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17658,6 +23084,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17679,6 +23111,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -17699,6 +23137,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17717,6 +23161,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17733,6 +23183,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17750,6 +23206,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17767,6 +23229,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -17782,6 +23250,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -17797,6 +23271,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17811,6 +23291,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17825,26 +23311,32 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -17852,6 +23344,7 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -17879,6 +23372,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -17899,6 +23398,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -17919,6 +23424,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17936,6 +23447,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17957,6 +23474,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -17977,6 +23500,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17995,6 +23524,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18011,6 +23546,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18028,6 +23569,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18045,6 +23592,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -18060,6 +23613,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -18075,6 +23634,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18089,6 +23654,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18103,26 +23674,32 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -18130,6 +23707,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -18157,6 +23735,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -18177,6 +23761,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -18197,6 +23787,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18214,6 +23810,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18235,6 +23837,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -18255,6 +23863,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18273,6 +23887,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18289,6 +23909,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18306,6 +23932,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18323,6 +23955,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -18338,6 +23976,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -18353,6 +23997,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18367,6 +24017,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18381,26 +24037,32 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -18408,6 +24070,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -18435,6 +24098,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -18455,6 +24124,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -18475,6 +24150,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18492,6 +24173,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18513,6 +24200,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -18533,6 +24226,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18551,6 +24250,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18567,6 +24272,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18584,6 +24295,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_one_as_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18601,6 +24318,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -18616,6 +24339,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -18631,6 +24360,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18645,6 +24380,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18659,26 +24400,32 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_cmpxchg( ; GFX1250-LABEL: private_agent_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -18686,6 +24433,7 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -18713,6 +24461,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -18733,6 +24487,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -18753,6 +24513,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18770,6 +24536,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18791,6 +24563,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -18811,6 +24589,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18829,6 +24613,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18845,6 +24635,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18862,6 +24658,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18879,6 +24681,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -18894,6 +24702,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -18909,6 +24723,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18923,6 +24743,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18937,26 +24763,32 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -18964,6 +24796,7 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -18991,6 +24824,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -19011,6 +24850,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -19031,6 +24876,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19048,6 +24899,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19069,6 +24926,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -19089,6 +24952,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19107,6 +24976,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19123,6 +24998,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -19140,6 +25021,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -19157,6 +25044,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -19172,6 +25065,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -19187,6 +25086,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19201,6 +25106,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19215,26 +25126,32 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -19242,6 +25159,7 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -19270,6 +25188,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -19294,6 +25218,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -19316,6 +25246,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19335,6 +25271,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19359,6 +25301,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -19381,6 +25329,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19401,6 +25355,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19420,6 +25380,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19440,6 +25406,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19459,6 +25431,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -19475,6 +25453,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -19491,6 +25475,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19506,6 +25496,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19522,25 +25518,31 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -19572,6 +25574,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -19596,6 +25604,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -19618,6 +25632,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19637,6 +25657,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19661,6 +25687,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -19683,6 +25715,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19703,6 +25741,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19722,6 +25766,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19742,6 +25792,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19761,6 +25817,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -19777,6 +25839,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -19793,6 +25861,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19808,6 +25882,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19824,25 +25904,31 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -19876,6 +25962,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -19900,6 +25992,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -19922,6 +26020,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19941,6 +26045,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19965,6 +26075,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -19987,6 +26103,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20007,6 +26129,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20026,6 +26154,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20046,6 +26180,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20065,6 +26205,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -20081,6 +26227,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -20097,6 +26249,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20112,6 +26270,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20128,25 +26292,31 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -20185,6 +26355,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -20209,6 +26385,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -20231,6 +26413,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20250,6 +26438,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20274,6 +26468,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -20296,6 +26496,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20316,6 +26522,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20335,6 +26547,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20355,6 +26573,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20374,6 +26598,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -20390,6 +26620,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -20406,6 +26642,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20421,6 +26663,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20437,25 +26685,31 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -20494,6 +26748,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -20518,6 +26778,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -20540,6 +26806,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20559,6 +26831,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20583,6 +26861,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -20605,6 +26889,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20625,6 +26915,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20644,6 +26940,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20664,6 +26966,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20683,6 +26991,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -20699,6 +27013,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -20715,6 +27035,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20730,6 +27056,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20746,25 +27078,31 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -20798,6 +27136,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -20822,6 +27166,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -20844,6 +27194,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20863,6 +27219,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20887,6 +27249,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -20909,6 +27277,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20929,6 +27303,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20948,6 +27328,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20968,6 +27354,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20987,6 +27379,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -21003,6 +27401,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -21019,6 +27423,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21034,6 +27444,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21050,25 +27466,31 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -21102,6 +27524,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -21126,6 +27554,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -21148,6 +27582,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21167,6 +27607,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21191,6 +27637,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -21213,6 +27665,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21233,6 +27691,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21252,6 +27716,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21272,6 +27742,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21291,6 +27767,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -21307,6 +27789,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -21323,6 +27811,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21338,6 +27832,12 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21354,25 +27854,31 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -21411,6 +27917,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -21435,6 +27947,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -21457,6 +27975,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21476,6 +28000,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21500,6 +28030,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -21522,6 +28058,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21542,6 +28084,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21561,6 +28109,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21581,6 +28135,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21600,6 +28160,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -21616,6 +28182,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -21632,6 +28204,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21647,6 +28225,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21663,25 +28247,31 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -21720,6 +28310,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -21744,6 +28340,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -21766,6 +28368,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21785,6 +28393,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21809,6 +28423,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -21831,6 +28451,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21851,6 +28477,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21870,6 +28502,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21890,6 +28528,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21909,6 +28553,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -21925,6 +28575,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -21941,6 +28597,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21956,6 +28618,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21972,25 +28640,31 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -22029,6 +28703,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -22053,6 +28733,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -22075,6 +28761,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22094,6 +28786,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22118,6 +28816,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -22140,6 +28844,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22160,6 +28870,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22179,6 +28895,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22199,6 +28921,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22218,6 +28946,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -22234,6 +28968,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -22250,6 +28990,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22265,6 +29011,12 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22281,25 +29033,31 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -22338,6 +29096,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -22362,6 +29126,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -22384,6 +29154,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22403,6 +29179,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22427,6 +29209,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -22449,6 +29237,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22469,6 +29263,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22488,6 +29288,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22508,6 +29314,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22527,6 +29339,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -22543,6 +29361,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -22559,6 +29383,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22574,6 +29404,12 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22590,25 +29426,31 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -22647,6 +29489,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -22671,6 +29519,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -22693,6 +29547,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22712,6 +29572,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22736,6 +29602,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -22758,6 +29630,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22778,6 +29656,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22797,6 +29681,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22817,6 +29707,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22836,6 +29732,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -22852,6 +29754,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -22868,6 +29776,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22883,6 +29797,12 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22899,25 +29819,31 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -22956,6 +29882,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -22980,6 +29912,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -23002,6 +29940,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -23021,6 +29965,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -23045,6 +29995,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -23067,6 +30023,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23087,6 +30049,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23106,6 +30074,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -23126,6 +30100,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -23145,6 +30125,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -23161,6 +30147,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -23177,6 +30169,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -23192,6 +30190,12 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -23208,25 +30212,31 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -23265,6 +30275,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -23289,6 +30305,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -23311,6 +30333,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -23330,6 +30358,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -23354,6 +30388,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -23376,6 +30416,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23396,6 +30442,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23415,6 +30467,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -23435,6 +30493,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -23454,6 +30518,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -23470,6 +30540,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -23486,6 +30562,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -23501,6 +30583,12 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -23517,25 +30605,31 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll index 33dbd9e50b52..c183a8642172 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll @@ -19,11 +19,15 @@ define amdgpu_kernel void @private_cluster_unordered_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -33,11 +37,15 @@ define amdgpu_kernel void @private_cluster_unordered_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -47,11 +55,15 @@ define amdgpu_kernel void @private_cluster_unordered_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -61,11 +73,15 @@ define amdgpu_kernel void @private_cluster_unordered_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -79,11 +95,15 @@ define amdgpu_kernel void @private_cluster_unordered_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -93,11 +113,15 @@ define amdgpu_kernel void @private_cluster_unordered_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -107,11 +131,15 @@ define amdgpu_kernel void @private_cluster_unordered_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -119,84 +147,108 @@ define amdgpu_kernel void @private_cluster_unordered_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_cluster_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_cluster_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_cluster_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_cluster_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_cluster_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_cluster_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_cluster_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -205,6 +257,7 @@ define amdgpu_kernel void @private_cluster_unordered_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -219,11 +272,15 @@ define amdgpu_kernel void @private_cluster_monotonic_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -233,11 +290,15 @@ define amdgpu_kernel void @private_cluster_monotonic_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -247,11 +308,15 @@ define amdgpu_kernel void @private_cluster_monotonic_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -261,11 +326,15 @@ define amdgpu_kernel void @private_cluster_monotonic_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -279,11 +348,15 @@ define amdgpu_kernel void @private_cluster_monotonic_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -293,11 +366,15 @@ define amdgpu_kernel void @private_cluster_monotonic_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -307,11 +384,15 @@ define amdgpu_kernel void @private_cluster_monotonic_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -319,84 +400,108 @@ define amdgpu_kernel void @private_cluster_monotonic_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_cluster_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_cluster_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_cluster_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_cluster_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_cluster_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_cluster_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_cluster_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -405,6 +510,7 @@ define amdgpu_kernel void @private_cluster_monotonic_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -419,11 +525,15 @@ define amdgpu_kernel void @private_cluster_acquire_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -433,11 +543,15 @@ define amdgpu_kernel void @private_cluster_acquire_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -447,11 +561,15 @@ define amdgpu_kernel void @private_cluster_acquire_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -461,11 +579,15 @@ define amdgpu_kernel void @private_cluster_acquire_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -479,11 +601,15 @@ define amdgpu_kernel void @private_cluster_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -493,11 +619,15 @@ define amdgpu_kernel void @private_cluster_acquire_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -507,11 +637,15 @@ define amdgpu_kernel void @private_cluster_acquire_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -519,84 +653,108 @@ define amdgpu_kernel void @private_cluster_acquire_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_cluster_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_cluster_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_cluster_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_cluster_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_cluster_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_cluster_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_cluster_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -606,6 +764,7 @@ define amdgpu_kernel void @private_cluster_acquire_load( ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -620,11 +779,15 @@ define amdgpu_kernel void @private_cluster_seq_cst_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -634,11 +797,15 @@ define amdgpu_kernel void @private_cluster_seq_cst_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -648,11 +815,15 @@ define amdgpu_kernel void @private_cluster_seq_cst_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -662,11 +833,15 @@ define amdgpu_kernel void @private_cluster_seq_cst_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -680,11 +855,15 @@ define amdgpu_kernel void @private_cluster_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -694,11 +873,15 @@ define amdgpu_kernel void @private_cluster_seq_cst_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -708,11 +891,15 @@ define amdgpu_kernel void @private_cluster_seq_cst_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -720,84 +907,108 @@ define amdgpu_kernel void @private_cluster_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_cluster_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_cluster_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_cluster_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_cluster_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_cluster_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -809,6 +1020,7 @@ define amdgpu_kernel void @private_cluster_seq_cst_load( ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -823,10 +1035,14 @@ define amdgpu_kernel void @private_cluster_unordered_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -835,10 +1051,14 @@ define amdgpu_kernel void @private_cluster_unordered_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -847,10 +1067,14 @@ define amdgpu_kernel void @private_cluster_unordered_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -859,10 +1083,14 @@ define amdgpu_kernel void @private_cluster_unordered_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -875,10 +1103,14 @@ define amdgpu_kernel void @private_cluster_unordered_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -887,10 +1119,14 @@ define amdgpu_kernel void @private_cluster_unordered_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -899,65 +1135,93 @@ define amdgpu_kernel void @private_cluster_unordered_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_cluster_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_cluster_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_cluster_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_cluster_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_cluster_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_cluster_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -965,22 +1229,26 @@ define amdgpu_kernel void @private_cluster_unordered_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -1002,10 +1270,14 @@ define amdgpu_kernel void @private_cluster_monotonic_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -1014,10 +1286,14 @@ define amdgpu_kernel void @private_cluster_monotonic_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -1026,10 +1302,14 @@ define amdgpu_kernel void @private_cluster_monotonic_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -1038,10 +1318,14 @@ define amdgpu_kernel void @private_cluster_monotonic_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -1054,10 +1338,14 @@ define amdgpu_kernel void @private_cluster_monotonic_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1066,10 +1354,14 @@ define amdgpu_kernel void @private_cluster_monotonic_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1078,65 +1370,93 @@ define amdgpu_kernel void @private_cluster_monotonic_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_cluster_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_cluster_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_cluster_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_cluster_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_cluster_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_cluster_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -1144,22 +1464,26 @@ define amdgpu_kernel void @private_cluster_monotonic_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -1181,10 +1505,14 @@ define amdgpu_kernel void @private_cluster_release_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -1193,10 +1521,14 @@ define amdgpu_kernel void @private_cluster_release_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -1205,10 +1537,14 @@ define amdgpu_kernel void @private_cluster_release_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -1217,10 +1553,14 @@ define amdgpu_kernel void @private_cluster_release_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -1233,10 +1573,14 @@ define amdgpu_kernel void @private_cluster_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1245,10 +1589,14 @@ define amdgpu_kernel void @private_cluster_release_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1257,65 +1605,93 @@ define amdgpu_kernel void @private_cluster_release_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_cluster_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_cluster_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_cluster_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_cluster_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_cluster_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_cluster_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -1323,22 +1699,26 @@ define amdgpu_kernel void @private_cluster_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -1362,10 +1742,14 @@ define amdgpu_kernel void @private_cluster_seq_cst_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -1374,10 +1758,14 @@ define amdgpu_kernel void @private_cluster_seq_cst_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -1386,10 +1774,14 @@ define amdgpu_kernel void @private_cluster_seq_cst_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -1398,10 +1790,14 @@ define amdgpu_kernel void @private_cluster_seq_cst_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -1414,10 +1810,14 @@ define amdgpu_kernel void @private_cluster_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1426,10 +1826,14 @@ define amdgpu_kernel void @private_cluster_seq_cst_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1438,65 +1842,93 @@ define amdgpu_kernel void @private_cluster_seq_cst_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_cluster_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_cluster_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_cluster_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_cluster_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -1504,22 +1936,26 @@ define amdgpu_kernel void @private_cluster_seq_cst_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -1544,8 +1980,15 @@ define amdgpu_kernel void @private_cluster_monotonic_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1556,8 +1999,15 @@ define amdgpu_kernel void @private_cluster_monotonic_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1568,8 +2018,15 @@ define amdgpu_kernel void @private_cluster_monotonic_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1580,8 +2037,15 @@ define amdgpu_kernel void @private_cluster_monotonic_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1596,8 +2060,15 @@ define amdgpu_kernel void @private_cluster_monotonic_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -1608,8 +2079,15 @@ define amdgpu_kernel void @private_cluster_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1620,8 +2098,15 @@ define amdgpu_kernel void @private_cluster_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1630,8 +2115,14 @@ define amdgpu_kernel void @private_cluster_monotonic_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -1639,8 +2130,14 @@ define amdgpu_kernel void @private_cluster_monotonic_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_cluster_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -1648,8 +2145,14 @@ define amdgpu_kernel void @private_cluster_monotonic_atomicrmw( ; GFX11-WGP-LABEL: private_cluster_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -1657,8 +2160,14 @@ define amdgpu_kernel void @private_cluster_monotonic_atomicrmw( ; GFX11-CU-LABEL: private_cluster_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -1666,8 +2175,14 @@ define amdgpu_kernel void @private_cluster_monotonic_atomicrmw( ; GFX12-WGP-LABEL: private_cluster_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -1675,8 +2190,14 @@ define amdgpu_kernel void @private_cluster_monotonic_atomicrmw( ; GFX12-CU-LABEL: private_cluster_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -1684,29 +2205,34 @@ define amdgpu_kernel void @private_cluster_monotonic_atomicrmw( ; GFX1250-LABEL: private_cluster_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE @@ -1723,8 +2249,15 @@ define amdgpu_kernel void @private_cluster_acquire_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1735,8 +2268,15 @@ define amdgpu_kernel void @private_cluster_acquire_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1747,8 +2287,15 @@ define amdgpu_kernel void @private_cluster_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1759,8 +2306,15 @@ define amdgpu_kernel void @private_cluster_acquire_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1775,8 +2329,15 @@ define amdgpu_kernel void @private_cluster_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -1787,8 +2348,15 @@ define amdgpu_kernel void @private_cluster_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1799,8 +2367,15 @@ define amdgpu_kernel void @private_cluster_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1809,8 +2384,14 @@ define amdgpu_kernel void @private_cluster_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -1818,8 +2399,14 @@ define amdgpu_kernel void @private_cluster_acquire_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_cluster_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -1827,8 +2414,14 @@ define amdgpu_kernel void @private_cluster_acquire_atomicrmw( ; GFX11-WGP-LABEL: private_cluster_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -1836,8 +2429,14 @@ define amdgpu_kernel void @private_cluster_acquire_atomicrmw( ; GFX11-CU-LABEL: private_cluster_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -1845,8 +2444,14 @@ define amdgpu_kernel void @private_cluster_acquire_atomicrmw( ; GFX12-WGP-LABEL: private_cluster_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -1854,8 +2459,14 @@ define amdgpu_kernel void @private_cluster_acquire_atomicrmw( ; GFX12-CU-LABEL: private_cluster_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -1863,29 +2474,34 @@ define amdgpu_kernel void @private_cluster_acquire_atomicrmw( ; GFX1250-LABEL: private_cluster_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE @@ -1904,8 +2520,15 @@ define amdgpu_kernel void @private_cluster_release_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1916,8 +2539,15 @@ define amdgpu_kernel void @private_cluster_release_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1928,8 +2558,15 @@ define amdgpu_kernel void @private_cluster_release_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1940,8 +2577,15 @@ define amdgpu_kernel void @private_cluster_release_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1956,8 +2600,15 @@ define amdgpu_kernel void @private_cluster_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -1968,8 +2619,15 @@ define amdgpu_kernel void @private_cluster_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1980,8 +2638,15 @@ define amdgpu_kernel void @private_cluster_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1990,8 +2655,14 @@ define amdgpu_kernel void @private_cluster_release_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -1999,8 +2670,14 @@ define amdgpu_kernel void @private_cluster_release_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_cluster_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -2008,8 +2685,14 @@ define amdgpu_kernel void @private_cluster_release_atomicrmw( ; GFX11-WGP-LABEL: private_cluster_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -2017,8 +2700,14 @@ define amdgpu_kernel void @private_cluster_release_atomicrmw( ; GFX11-CU-LABEL: private_cluster_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -2026,8 +2715,14 @@ define amdgpu_kernel void @private_cluster_release_atomicrmw( ; GFX12-WGP-LABEL: private_cluster_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -2035,8 +2730,14 @@ define amdgpu_kernel void @private_cluster_release_atomicrmw( ; GFX12-CU-LABEL: private_cluster_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -2044,29 +2745,34 @@ define amdgpu_kernel void @private_cluster_release_atomicrmw( ; GFX1250-LABEL: private_cluster_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 @@ -2085,8 +2791,15 @@ define amdgpu_kernel void @private_cluster_acq_rel_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2097,8 +2810,15 @@ define amdgpu_kernel void @private_cluster_acq_rel_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2109,8 +2829,15 @@ define amdgpu_kernel void @private_cluster_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2121,8 +2848,15 @@ define amdgpu_kernel void @private_cluster_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2137,8 +2871,15 @@ define amdgpu_kernel void @private_cluster_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -2149,8 +2890,15 @@ define amdgpu_kernel void @private_cluster_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2161,8 +2909,15 @@ define amdgpu_kernel void @private_cluster_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2171,8 +2926,14 @@ define amdgpu_kernel void @private_cluster_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2180,8 +2941,14 @@ define amdgpu_kernel void @private_cluster_acq_rel_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_cluster_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -2189,8 +2956,14 @@ define amdgpu_kernel void @private_cluster_acq_rel_atomicrmw( ; GFX11-WGP-LABEL: private_cluster_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -2198,8 +2971,14 @@ define amdgpu_kernel void @private_cluster_acq_rel_atomicrmw( ; GFX11-CU-LABEL: private_cluster_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -2207,8 +2986,14 @@ define amdgpu_kernel void @private_cluster_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: private_cluster_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -2216,8 +3001,14 @@ define amdgpu_kernel void @private_cluster_acq_rel_atomicrmw( ; GFX12-CU-LABEL: private_cluster_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -2225,29 +3016,34 @@ define amdgpu_kernel void @private_cluster_acq_rel_atomicrmw( ; GFX1250-LABEL: private_cluster_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 @@ -2268,8 +3064,15 @@ define amdgpu_kernel void @private_cluster_seq_cst_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2280,8 +3083,15 @@ define amdgpu_kernel void @private_cluster_seq_cst_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2292,8 +3102,15 @@ define amdgpu_kernel void @private_cluster_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2304,8 +3121,15 @@ define amdgpu_kernel void @private_cluster_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2320,8 +3144,15 @@ define amdgpu_kernel void @private_cluster_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -2332,8 +3163,15 @@ define amdgpu_kernel void @private_cluster_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2344,8 +3182,15 @@ define amdgpu_kernel void @private_cluster_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2354,8 +3199,14 @@ define amdgpu_kernel void @private_cluster_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2363,8 +3214,14 @@ define amdgpu_kernel void @private_cluster_seq_cst_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -2372,8 +3229,14 @@ define amdgpu_kernel void @private_cluster_seq_cst_atomicrmw( ; GFX11-WGP-LABEL: private_cluster_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -2381,8 +3244,14 @@ define amdgpu_kernel void @private_cluster_seq_cst_atomicrmw( ; GFX11-CU-LABEL: private_cluster_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -2390,8 +3259,14 @@ define amdgpu_kernel void @private_cluster_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: private_cluster_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -2399,8 +3274,14 @@ define amdgpu_kernel void @private_cluster_seq_cst_atomicrmw( ; GFX12-CU-LABEL: private_cluster_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -2408,29 +3289,34 @@ define amdgpu_kernel void @private_cluster_seq_cst_atomicrmw( ; GFX1250-LABEL: private_cluster_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 @@ -2451,6 +3337,10 @@ define amdgpu_kernel void @private_cluster_acquire_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2469,6 +3359,10 @@ define amdgpu_kernel void @private_cluster_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2486,6 +3380,10 @@ define amdgpu_kernel void @private_cluster_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2503,6 +3401,10 @@ define amdgpu_kernel void @private_cluster_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2524,6 +3426,10 @@ define amdgpu_kernel void @private_cluster_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2541,6 +3447,10 @@ define amdgpu_kernel void @private_cluster_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2558,6 +3468,10 @@ define amdgpu_kernel void @private_cluster_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2573,6 +3487,10 @@ define amdgpu_kernel void @private_cluster_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -2585,6 +3503,10 @@ define amdgpu_kernel void @private_cluster_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_cluster_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -2597,6 +3519,10 @@ define amdgpu_kernel void @private_cluster_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: private_cluster_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -2609,6 +3535,10 @@ define amdgpu_kernel void @private_cluster_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: private_cluster_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -2621,6 +3551,10 @@ define amdgpu_kernel void @private_cluster_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: private_cluster_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -2633,6 +3567,10 @@ define amdgpu_kernel void @private_cluster_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: private_cluster_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -2646,22 +3584,26 @@ define amdgpu_kernel void @private_cluster_acquire_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -2688,6 +3630,10 @@ define amdgpu_kernel void @private_cluster_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2706,6 +3652,10 @@ define amdgpu_kernel void @private_cluster_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2723,6 +3673,10 @@ define amdgpu_kernel void @private_cluster_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2740,6 +3694,10 @@ define amdgpu_kernel void @private_cluster_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2761,6 +3719,10 @@ define amdgpu_kernel void @private_cluster_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2778,6 +3740,10 @@ define amdgpu_kernel void @private_cluster_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2795,6 +3761,10 @@ define amdgpu_kernel void @private_cluster_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2810,6 +3780,10 @@ define amdgpu_kernel void @private_cluster_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -2822,6 +3796,10 @@ define amdgpu_kernel void @private_cluster_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_cluster_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -2834,6 +3812,10 @@ define amdgpu_kernel void @private_cluster_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: private_cluster_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -2846,6 +3828,10 @@ define amdgpu_kernel void @private_cluster_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: private_cluster_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -2858,6 +3844,10 @@ define amdgpu_kernel void @private_cluster_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: private_cluster_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -2870,6 +3860,10 @@ define amdgpu_kernel void @private_cluster_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: private_cluster_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -2883,22 +3877,26 @@ define amdgpu_kernel void @private_cluster_acq_rel_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -2927,6 +3925,10 @@ define amdgpu_kernel void @private_cluster_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2945,6 +3947,10 @@ define amdgpu_kernel void @private_cluster_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2962,6 +3968,10 @@ define amdgpu_kernel void @private_cluster_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2979,6 +3989,10 @@ define amdgpu_kernel void @private_cluster_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -3000,6 +4014,10 @@ define amdgpu_kernel void @private_cluster_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -3017,6 +4035,10 @@ define amdgpu_kernel void @private_cluster_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -3034,6 +4056,10 @@ define amdgpu_kernel void @private_cluster_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -3049,6 +4075,10 @@ define amdgpu_kernel void @private_cluster_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -3061,6 +4091,10 @@ define amdgpu_kernel void @private_cluster_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -3073,6 +4107,10 @@ define amdgpu_kernel void @private_cluster_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: private_cluster_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -3085,6 +4123,10 @@ define amdgpu_kernel void @private_cluster_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: private_cluster_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -3097,6 +4139,10 @@ define amdgpu_kernel void @private_cluster_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: private_cluster_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -3109,6 +4155,10 @@ define amdgpu_kernel void @private_cluster_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: private_cluster_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -3122,22 +4172,26 @@ define amdgpu_kernel void @private_cluster_seq_cst_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -3166,6 +4220,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -3186,6 +4246,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -3206,6 +4272,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3223,6 +4295,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3244,6 +4322,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -3264,6 +4348,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3282,6 +4372,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3298,6 +4394,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3315,6 +4417,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3332,6 +4440,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -3347,6 +4461,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_cluster_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -3362,6 +4482,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3376,6 +4502,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_cluster_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3390,26 +4522,32 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: private_cluster_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -3417,6 +4555,7 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -3436,6 +4575,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -3456,6 +4601,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -3476,6 +4627,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3493,6 +4650,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3514,6 +4677,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -3534,6 +4703,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3552,6 +4727,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3568,6 +4749,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3585,6 +4772,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3602,6 +4795,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -3617,6 +4816,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_cluster_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -3632,6 +4837,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3646,6 +4857,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_cluster_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3660,26 +4877,32 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: private_cluster_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -3687,6 +4910,7 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -3708,6 +4932,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -3728,6 +4958,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -3748,6 +4984,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3765,6 +5007,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3786,6 +5034,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -3806,6 +5060,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3824,6 +5084,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3840,6 +5106,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3857,6 +5129,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3874,6 +5152,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -3889,6 +5173,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_cluster_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -3904,6 +5194,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3918,6 +5214,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_cluster_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3932,26 +5234,32 @@ define amdgpu_kernel void @private_cluster_release_monotonic_cmpxchg( ; GFX1250-LABEL: private_cluster_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -3959,6 +5267,7 @@ define amdgpu_kernel void @private_cluster_release_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -3980,6 +5289,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -4000,6 +5315,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -4020,6 +5341,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4037,6 +5364,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4058,6 +5391,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -4078,6 +5417,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4096,6 +5441,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4112,6 +5463,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4129,6 +5486,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4146,6 +5509,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -4161,6 +5530,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_cluster_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -4176,6 +5551,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4190,6 +5571,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_cluster_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4204,26 +5591,32 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: private_cluster_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -4231,6 +5624,7 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -4254,6 +5648,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -4274,6 +5674,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -4294,6 +5700,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4311,6 +5723,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4332,6 +5750,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -4352,6 +5776,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4370,6 +5800,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4386,6 +5822,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4403,6 +5845,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4420,6 +5868,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -4435,6 +5889,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_cluster_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -4450,6 +5910,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4464,6 +5930,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_cluster_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4478,26 +5950,32 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: private_cluster_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -4505,6 +5983,7 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -4528,6 +6007,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -4548,6 +6033,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -4568,6 +6059,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4585,6 +6082,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4606,6 +6109,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -4626,6 +6135,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4644,6 +6159,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4660,6 +6181,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4677,6 +6204,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4694,6 +6227,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -4709,6 +6248,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_cmpxchg( ; GFX11-CU-LABEL: private_cluster_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -4724,6 +6269,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4738,6 +6289,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: private_cluster_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4752,26 +6309,32 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: private_cluster_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -4779,6 +6342,7 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -4800,6 +6364,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -4820,6 +6390,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -4840,6 +6416,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4857,6 +6439,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4878,6 +6466,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -4898,6 +6492,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4916,6 +6516,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4932,6 +6538,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4949,6 +6561,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4966,6 +6584,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -4981,6 +6605,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_cmpxchg( ; GFX11-CU-LABEL: private_cluster_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -4996,6 +6626,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5010,6 +6646,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: private_cluster_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5024,26 +6666,32 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_cmpxchg( ; GFX1250-LABEL: private_cluster_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -5051,6 +6699,7 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -5072,6 +6721,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -5092,6 +6747,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -5112,6 +6773,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5129,6 +6796,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5150,6 +6823,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -5170,6 +6849,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5188,6 +6873,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5204,6 +6895,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5221,6 +6918,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5238,6 +6941,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -5253,6 +6962,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_cmpxchg( ; GFX11-CU-LABEL: private_cluster_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -5268,6 +6983,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5282,6 +7003,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_cmpxchg( ; GFX12-CU-LABEL: private_cluster_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5296,26 +7023,32 @@ define amdgpu_kernel void @private_cluster_release_acquire_cmpxchg( ; GFX1250-LABEL: private_cluster_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -5323,6 +7056,7 @@ define amdgpu_kernel void @private_cluster_release_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -5346,6 +7080,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -5366,6 +7106,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -5386,6 +7132,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5403,6 +7155,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5424,6 +7182,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -5444,6 +7208,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5462,6 +7232,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5478,6 +7254,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5495,6 +7277,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5512,6 +7300,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -5527,6 +7321,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_cmpxchg( ; GFX11-CU-LABEL: private_cluster_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -5542,6 +7342,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5556,6 +7362,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: private_cluster_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5570,26 +7382,32 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: private_cluster_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -5597,6 +7415,7 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -5620,6 +7439,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -5640,6 +7465,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -5660,6 +7491,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5677,6 +7514,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5698,6 +7541,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -5718,6 +7567,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5736,6 +7591,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5752,6 +7613,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5769,6 +7636,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5786,6 +7659,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -5801,6 +7680,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_cmpxchg( ; GFX11-CU-LABEL: private_cluster_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -5816,6 +7701,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5830,6 +7721,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: private_cluster_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5844,26 +7741,32 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: private_cluster_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -5871,6 +7774,7 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -5894,6 +7798,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -5914,6 +7824,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -5934,6 +7850,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5951,6 +7873,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5972,6 +7900,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -5992,6 +7926,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6010,6 +7950,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6026,6 +7972,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6043,6 +7995,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6060,6 +8018,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -6075,6 +8039,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_cluster_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -6090,6 +8060,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6104,6 +8080,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_cluster_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6118,26 +8100,32 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: private_cluster_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -6145,6 +8133,7 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -6168,6 +8157,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -6188,6 +8183,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -6208,6 +8209,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6225,6 +8232,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6246,6 +8259,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -6266,6 +8285,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6284,6 +8309,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6300,6 +8331,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6317,6 +8354,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6334,6 +8377,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -6349,6 +8398,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_cluster_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -6364,6 +8419,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6378,6 +8439,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_cluster_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6392,26 +8459,32 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: private_cluster_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -6419,6 +8492,7 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -6442,6 +8516,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -6462,6 +8542,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -6482,6 +8568,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6499,6 +8591,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6520,6 +8618,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -6540,6 +8644,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6558,6 +8668,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6574,6 +8690,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6591,6 +8713,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6608,6 +8736,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -6623,6 +8757,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_cluster_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -6638,6 +8778,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6652,6 +8798,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_cluster_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6666,26 +8818,32 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_cmpxchg( ; GFX1250-LABEL: private_cluster_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -6693,6 +8851,7 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -6716,6 +8875,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -6736,6 +8901,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -6756,6 +8927,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6773,6 +8950,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6794,6 +8977,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -6814,6 +9003,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6832,6 +9027,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6848,6 +9049,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6865,6 +9072,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6882,6 +9095,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -6897,6 +9116,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -6912,6 +9137,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6926,6 +9157,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6940,26 +9177,32 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -6967,6 +9210,7 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -6990,6 +9234,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -7010,6 +9260,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -7030,6 +9286,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7047,6 +9309,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7068,6 +9336,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -7088,6 +9362,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7106,6 +9386,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7122,6 +9408,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -7139,6 +9431,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -7156,6 +9454,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -7171,6 +9475,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -7186,6 +9496,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7200,6 +9516,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7214,26 +9536,32 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -7241,6 +9569,7 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -7265,6 +9594,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -7289,6 +9624,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -7311,6 +9652,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7330,6 +9677,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7354,6 +9707,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -7376,6 +9735,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7396,6 +9761,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7415,6 +9786,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7435,6 +9812,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7454,6 +9837,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -7470,6 +9859,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -7486,6 +9881,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7501,6 +9902,12 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7517,25 +9924,31 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -7567,6 +9980,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -7591,6 +10010,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -7613,6 +10038,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7632,6 +10063,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7656,6 +10093,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -7678,6 +10121,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7698,6 +10147,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7717,6 +10172,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7737,6 +10198,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7756,6 +10223,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -7772,6 +10245,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -7788,6 +10267,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7803,6 +10288,12 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7819,25 +10310,31 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -7870,6 +10367,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -7894,6 +10397,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -7916,6 +10425,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7935,6 +10450,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7959,6 +10480,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -7981,6 +10508,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8001,6 +10534,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8020,6 +10559,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8040,6 +10585,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8059,6 +10610,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -8075,6 +10632,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -8091,6 +10654,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8106,6 +10675,12 @@ define amdgpu_kernel void @private_cluster_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8122,25 +10697,31 @@ define amdgpu_kernel void @private_cluster_release_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -8174,6 +10755,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -8198,6 +10785,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -8220,6 +10813,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8239,6 +10838,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8263,6 +10868,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -8285,6 +10896,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8305,6 +10922,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8324,6 +10947,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8344,6 +10973,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8363,6 +10998,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -8379,6 +11020,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -8395,6 +11042,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8410,6 +11063,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8426,25 +11085,31 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -8479,6 +11144,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -8503,6 +11174,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -8525,6 +11202,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8544,6 +11227,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8568,6 +11257,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -8590,6 +11285,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8610,6 +11311,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8629,6 +11336,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8649,6 +11362,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8668,6 +11387,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -8684,6 +11409,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -8700,6 +11431,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8715,6 +11452,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8731,25 +11474,31 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -8784,6 +11533,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -8808,6 +11563,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -8830,6 +11591,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8849,6 +11616,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8873,6 +11646,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -8895,6 +11674,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8915,6 +11700,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8934,6 +11725,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8954,6 +11751,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8973,6 +11776,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -8989,6 +11798,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -9005,6 +11820,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9020,6 +11841,12 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9036,25 +11863,31 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -9087,6 +11920,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -9111,6 +11950,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -9133,6 +11978,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9152,6 +12003,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9176,6 +12033,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -9198,6 +12061,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9218,6 +12087,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9237,6 +12112,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9257,6 +12138,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9276,6 +12163,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -9292,6 +12185,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -9308,6 +12207,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9323,6 +12228,12 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9339,25 +12250,31 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -9390,6 +12307,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -9414,6 +12337,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -9436,6 +12365,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9455,6 +12390,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9479,6 +12420,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -9501,6 +12448,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9521,6 +12474,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9540,6 +12499,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9560,6 +12525,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9579,6 +12550,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -9595,6 +12572,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -9611,6 +12594,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9626,6 +12615,12 @@ define amdgpu_kernel void @private_cluster_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9642,25 +12637,31 @@ define amdgpu_kernel void @private_cluster_release_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -9695,6 +12696,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -9719,6 +12726,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -9741,6 +12754,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9760,6 +12779,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9784,6 +12809,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -9806,6 +12837,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9826,6 +12863,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9845,6 +12888,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9865,6 +12914,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9884,6 +12939,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -9900,6 +12961,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -9916,6 +12983,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9931,6 +13004,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9947,25 +13026,31 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -10000,6 +13085,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -10024,6 +13115,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -10046,6 +13143,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10065,6 +13168,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10089,6 +13198,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -10111,6 +13226,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10131,6 +13252,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10150,6 +13277,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10170,6 +13303,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10189,6 +13328,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -10205,6 +13350,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -10221,6 +13372,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10236,6 +13393,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10252,25 +13415,31 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -10305,6 +13474,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -10329,6 +13504,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -10351,6 +13532,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10370,6 +13557,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10394,6 +13587,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -10416,6 +13615,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10436,6 +13641,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10455,6 +13666,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10475,6 +13692,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10494,6 +13717,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -10510,6 +13739,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -10526,6 +13761,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10541,6 +13782,12 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10557,25 +13804,31 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -10610,6 +13863,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -10634,6 +13893,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -10656,6 +13921,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10675,6 +13946,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10699,6 +13976,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -10721,6 +14004,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10741,6 +14030,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10760,6 +14055,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10780,6 +14081,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10799,6 +14106,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -10815,6 +14128,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -10831,6 +14150,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10846,6 +14171,12 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10862,25 +14193,31 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -10915,6 +14252,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -10939,6 +14282,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -10961,6 +14310,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10980,6 +14335,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11004,6 +14365,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -11026,6 +14393,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11046,6 +14419,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11065,6 +14444,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11085,6 +14470,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11104,6 +14495,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -11120,6 +14517,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -11136,6 +14539,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11151,6 +14560,12 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11167,25 +14582,31 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -11220,6 +14641,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -11244,6 +14671,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -11266,6 +14699,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11285,6 +14724,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11309,6 +14754,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -11331,6 +14782,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11351,6 +14808,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11370,6 +14833,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11390,6 +14859,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11409,6 +14884,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -11425,6 +14906,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -11441,6 +14928,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11456,6 +14949,12 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11472,25 +14971,31 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -11525,6 +15030,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -11549,6 +15060,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -11571,6 +15088,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11590,6 +15113,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11614,6 +15143,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -11636,6 +15171,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11656,6 +15197,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11675,6 +15222,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11695,6 +15248,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11714,6 +15273,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -11730,6 +15295,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -11746,6 +15317,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11761,6 +15338,12 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11777,25 +15360,31 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -11828,11 +15417,15 @@ define amdgpu_kernel void @private_cluster_one_as_unordered_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11842,11 +15435,15 @@ define amdgpu_kernel void @private_cluster_one_as_unordered_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11856,11 +15453,15 @@ define amdgpu_kernel void @private_cluster_one_as_unordered_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11870,11 +15471,15 @@ define amdgpu_kernel void @private_cluster_one_as_unordered_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11888,11 +15493,15 @@ define amdgpu_kernel void @private_cluster_one_as_unordered_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -11902,11 +15511,15 @@ define amdgpu_kernel void @private_cluster_one_as_unordered_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11916,11 +15529,15 @@ define amdgpu_kernel void @private_cluster_one_as_unordered_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11928,84 +15545,108 @@ define amdgpu_kernel void @private_cluster_one_as_unordered_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_cluster_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_cluster_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_cluster_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_cluster_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_cluster_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12014,6 +15655,7 @@ define amdgpu_kernel void @private_cluster_one_as_unordered_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -12028,11 +15670,15 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12042,11 +15688,15 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12056,11 +15706,15 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12070,11 +15724,15 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12088,11 +15746,15 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -12102,11 +15764,15 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12116,11 +15782,15 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12128,84 +15798,108 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_cluster_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_cluster_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_cluster_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12214,6 +15908,7 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -12228,11 +15923,15 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12242,11 +15941,15 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12256,11 +15959,15 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12270,11 +15977,15 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12288,11 +15999,15 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -12302,11 +16017,15 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12316,11 +16035,15 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12328,84 +16051,108 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_cluster_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_cluster_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_cluster_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_cluster_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_cluster_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12416,6 +16163,7 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_load( ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SE ; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -12430,11 +16178,15 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12444,11 +16196,15 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12458,11 +16214,15 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12472,11 +16232,15 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12490,11 +16254,15 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -12504,11 +16272,15 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12518,11 +16290,15 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12530,84 +16306,108 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_cluster_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12620,6 +16420,7 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_load( ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SE ; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -12634,10 +16435,14 @@ define amdgpu_kernel void @private_cluster_one_as_unordered_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -12646,10 +16451,14 @@ define amdgpu_kernel void @private_cluster_one_as_unordered_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -12658,10 +16467,14 @@ define amdgpu_kernel void @private_cluster_one_as_unordered_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -12670,10 +16483,14 @@ define amdgpu_kernel void @private_cluster_one_as_unordered_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -12686,10 +16503,14 @@ define amdgpu_kernel void @private_cluster_one_as_unordered_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -12698,10 +16519,14 @@ define amdgpu_kernel void @private_cluster_one_as_unordered_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -12710,65 +16535,93 @@ define amdgpu_kernel void @private_cluster_one_as_unordered_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_cluster_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_cluster_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_cluster_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_cluster_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -12776,22 +16629,26 @@ define amdgpu_kernel void @private_cluster_one_as_unordered_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12813,10 +16670,14 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -12825,10 +16686,14 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -12837,10 +16702,14 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -12849,10 +16718,14 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -12865,10 +16738,14 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -12877,10 +16754,14 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -12889,65 +16770,93 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_cluster_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_cluster_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -12955,22 +16864,26 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12992,10 +16905,14 @@ define amdgpu_kernel void @private_cluster_one_as_release_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -13004,10 +16921,14 @@ define amdgpu_kernel void @private_cluster_one_as_release_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -13016,10 +16937,14 @@ define amdgpu_kernel void @private_cluster_one_as_release_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -13028,10 +16953,14 @@ define amdgpu_kernel void @private_cluster_one_as_release_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -13044,10 +16973,14 @@ define amdgpu_kernel void @private_cluster_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -13056,10 +16989,14 @@ define amdgpu_kernel void @private_cluster_one_as_release_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -13068,65 +17005,93 @@ define amdgpu_kernel void @private_cluster_one_as_release_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_cluster_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_cluster_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_cluster_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_cluster_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -13134,22 +17099,26 @@ define amdgpu_kernel void @private_cluster_one_as_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -13173,10 +17142,14 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -13185,10 +17158,14 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -13197,10 +17174,14 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -13209,10 +17190,14 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -13225,10 +17210,14 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -13237,10 +17226,14 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -13249,65 +17242,93 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -13315,22 +17336,26 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -13355,8 +17380,15 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13367,8 +17399,15 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13379,8 +17418,15 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13391,8 +17437,15 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13407,8 +17460,15 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -13419,8 +17479,15 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13431,8 +17498,15 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13441,8 +17515,14 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -13450,8 +17530,14 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -13459,8 +17545,14 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_atomicrmw( ; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -13468,8 +17560,14 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_atomicrmw( ; GFX11-CU-LABEL: private_cluster_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -13477,8 +17575,14 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_atomicrmw( ; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -13486,8 +17590,14 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_atomicrmw( ; GFX12-CU-LABEL: private_cluster_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -13495,29 +17605,34 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_atomicrmw( ; GFX1250-LABEL: private_cluster_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE @@ -13534,8 +17649,15 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13546,8 +17668,15 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13558,8 +17687,15 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13570,8 +17706,15 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13586,8 +17729,15 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -13598,8 +17748,15 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13610,8 +17767,15 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13620,8 +17784,14 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -13629,8 +17799,14 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -13638,8 +17814,14 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_atomicrmw( ; GFX11-WGP-LABEL: private_cluster_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -13647,8 +17829,14 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_atomicrmw( ; GFX11-CU-LABEL: private_cluster_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -13656,8 +17844,14 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_atomicrmw( ; GFX12-WGP-LABEL: private_cluster_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -13665,8 +17859,14 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_atomicrmw( ; GFX12-CU-LABEL: private_cluster_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -13674,29 +17874,34 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_atomicrmw( ; GFX1250-LABEL: private_cluster_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE @@ -13715,8 +17920,15 @@ define amdgpu_kernel void @private_cluster_one_as_release_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13727,8 +17939,15 @@ define amdgpu_kernel void @private_cluster_one_as_release_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13739,8 +17958,15 @@ define amdgpu_kernel void @private_cluster_one_as_release_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13751,8 +17977,15 @@ define amdgpu_kernel void @private_cluster_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13767,8 +18000,15 @@ define amdgpu_kernel void @private_cluster_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -13779,8 +18019,15 @@ define amdgpu_kernel void @private_cluster_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13791,8 +18038,15 @@ define amdgpu_kernel void @private_cluster_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13801,8 +18055,14 @@ define amdgpu_kernel void @private_cluster_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -13810,8 +18070,14 @@ define amdgpu_kernel void @private_cluster_one_as_release_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -13819,8 +18085,14 @@ define amdgpu_kernel void @private_cluster_one_as_release_atomicrmw( ; GFX11-WGP-LABEL: private_cluster_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -13828,8 +18100,14 @@ define amdgpu_kernel void @private_cluster_one_as_release_atomicrmw( ; GFX11-CU-LABEL: private_cluster_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -13837,8 +18115,14 @@ define amdgpu_kernel void @private_cluster_one_as_release_atomicrmw( ; GFX12-WGP-LABEL: private_cluster_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -13846,8 +18130,14 @@ define amdgpu_kernel void @private_cluster_one_as_release_atomicrmw( ; GFX12-CU-LABEL: private_cluster_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -13855,29 +18145,34 @@ define amdgpu_kernel void @private_cluster_one_as_release_atomicrmw( ; GFX1250-LABEL: private_cluster_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -13896,8 +18191,15 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13908,8 +18210,15 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13920,8 +18229,15 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13932,8 +18248,15 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13948,8 +18271,15 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -13960,8 +18290,15 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13972,8 +18309,15 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13982,8 +18326,14 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -13991,8 +18341,14 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -14000,8 +18356,14 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_atomicrmw( ; GFX11-WGP-LABEL: private_cluster_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -14009,8 +18371,14 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_atomicrmw( ; GFX11-CU-LABEL: private_cluster_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -14018,8 +18386,14 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: private_cluster_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -14027,8 +18401,14 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_atomicrmw( ; GFX12-CU-LABEL: private_cluster_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -14036,29 +18416,34 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_atomicrmw( ; GFX1250-LABEL: private_cluster_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -14079,8 +18464,15 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14091,8 +18483,15 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14103,8 +18502,15 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14115,8 +18521,15 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14131,8 +18544,15 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -14143,8 +18563,15 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14155,8 +18582,15 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14165,8 +18599,14 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -14174,8 +18614,14 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -14183,8 +18629,14 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_atomicrmw( ; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -14192,8 +18644,14 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_atomicrmw( ; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -14201,8 +18659,14 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -14210,8 +18674,14 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_atomicrmw( ; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -14219,29 +18689,34 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_atomicrmw( ; GFX1250-LABEL: private_cluster_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -14262,6 +18737,10 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14280,6 +18759,10 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14297,6 +18780,10 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -14314,6 +18801,10 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -14335,6 +18826,10 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14352,6 +18847,10 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14369,6 +18868,10 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14384,6 +18887,10 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14396,6 +18903,10 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14408,6 +18919,10 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: private_cluster_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14420,6 +18935,10 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: private_cluster_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14432,6 +18951,10 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: private_cluster_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14444,6 +18967,10 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: private_cluster_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14457,22 +18984,26 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -14500,6 +19031,10 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14518,6 +19053,10 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14535,6 +19074,10 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -14552,6 +19095,10 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -14573,6 +19120,10 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14590,6 +19141,10 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14607,6 +19162,10 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14622,6 +19181,10 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14634,6 +19197,10 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14646,6 +19213,10 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14658,6 +19229,10 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14670,6 +19245,10 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14682,6 +19261,10 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14695,22 +19278,26 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -14740,6 +19327,10 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14758,6 +19349,10 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14775,6 +19370,10 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -14792,6 +19391,10 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -14813,6 +19416,10 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14830,6 +19437,10 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14847,6 +19458,10 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14862,6 +19477,10 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14874,6 +19493,10 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14886,6 +19509,10 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14898,6 +19525,10 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14910,6 +19541,10 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14922,6 +19557,10 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14935,22 +19574,26 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -14980,6 +19623,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -15000,6 +19649,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -15020,6 +19675,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15037,6 +19698,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15058,6 +19725,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -15078,6 +19751,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15096,6 +19775,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15112,6 +19797,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15129,6 +19820,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15146,6 +19843,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -15161,6 +19864,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -15176,6 +19885,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15190,6 +19905,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15204,26 +19925,32 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -15231,6 +19958,7 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -15250,6 +19978,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -15270,6 +20004,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -15290,6 +20030,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15307,6 +20053,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15328,6 +20080,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -15348,6 +20106,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15366,6 +20130,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15382,6 +20152,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15399,6 +20175,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15416,6 +20198,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -15431,6 +20219,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -15446,6 +20240,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15460,6 +20260,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15474,26 +20280,32 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -15501,6 +20313,7 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -15522,6 +20335,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -15542,6 +20361,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -15562,6 +20387,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15579,6 +20410,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15600,6 +20437,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -15620,6 +20463,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15638,6 +20487,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15654,6 +20509,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15671,6 +20532,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15688,6 +20555,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -15703,6 +20576,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -15718,6 +20597,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15732,6 +20617,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15746,26 +20637,32 @@ define amdgpu_kernel void @private_cluster_one_as_release_monotonic_cmpxchg( ; GFX1250-LABEL: private_cluster_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -15773,6 +20670,7 @@ define amdgpu_kernel void @private_cluster_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -15794,6 +20692,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -15814,6 +20718,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -15834,6 +20744,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15851,6 +20767,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15872,6 +20794,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -15892,6 +20820,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15910,6 +20844,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15926,6 +20866,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15943,6 +20889,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15960,6 +20912,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -15975,6 +20933,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -15990,6 +20954,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16004,6 +20974,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16018,26 +20994,32 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -16045,6 +21027,7 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -16068,6 +21051,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -16088,6 +21077,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -16108,6 +21103,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16125,6 +21126,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16146,6 +21153,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -16166,6 +21179,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16184,6 +21203,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16200,6 +21225,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16217,6 +21248,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16234,6 +21271,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -16249,6 +21292,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -16264,6 +21313,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16278,6 +21333,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16292,26 +21353,32 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -16319,6 +21386,7 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -16342,6 +21410,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -16362,6 +21436,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -16382,6 +21462,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16399,6 +21485,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16420,6 +21512,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -16440,6 +21538,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16458,6 +21562,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16474,6 +21584,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16491,6 +21607,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16508,6 +21630,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -16523,6 +21651,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -16538,6 +21672,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16552,6 +21692,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16566,26 +21712,32 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -16593,6 +21745,7 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -16614,6 +21767,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -16634,6 +21793,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -16654,6 +21819,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16671,6 +21842,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16692,6 +21869,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -16712,6 +21895,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16730,6 +21919,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16746,6 +21941,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16763,6 +21964,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16780,6 +21987,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -16795,6 +22008,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -16810,6 +22029,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16824,6 +22049,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16838,26 +22069,32 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_cmpxchg( ; GFX1250-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -16865,6 +22102,7 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -16886,6 +22124,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -16906,6 +22150,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -16926,6 +22176,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16943,6 +22199,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16964,6 +22226,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -16984,6 +22252,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17002,6 +22276,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17018,6 +22298,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17035,6 +22321,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17052,6 +22344,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -17067,6 +22365,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -17082,6 +22386,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17096,6 +22406,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17110,26 +22426,32 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_cmpxchg( ; GFX1250-LABEL: private_cluster_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -17137,6 +22459,7 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -17160,6 +22483,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -17180,6 +22509,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -17200,6 +22535,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17217,6 +22558,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17238,6 +22585,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -17258,6 +22611,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17276,6 +22635,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17292,6 +22657,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17309,6 +22680,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17326,6 +22703,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -17341,6 +22724,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -17356,6 +22745,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17370,6 +22765,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17384,26 +22785,32 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -17411,6 +22818,7 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -17434,6 +22842,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -17454,6 +22868,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -17474,6 +22894,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17491,6 +22917,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17512,6 +22944,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -17532,6 +22970,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17550,6 +22994,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17566,6 +23016,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17583,6 +23039,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17600,6 +23062,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -17615,6 +23083,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -17630,6 +23104,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17644,6 +23124,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17658,26 +23144,32 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -17685,6 +23177,7 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -17708,6 +23201,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -17728,6 +23227,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -17748,6 +23253,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17765,6 +23276,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17786,6 +23303,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -17806,6 +23329,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17824,6 +23353,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17840,6 +23375,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17857,6 +23398,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17874,6 +23421,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -17889,6 +23442,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -17904,6 +23463,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17918,6 +23483,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17932,26 +23503,32 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -17959,6 +23536,7 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -17982,6 +23560,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -18002,6 +23586,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -18022,6 +23612,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18039,6 +23635,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18060,6 +23662,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -18080,6 +23688,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18098,6 +23712,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18114,6 +23734,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18131,6 +23757,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18148,6 +23780,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -18163,6 +23801,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -18178,6 +23822,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18192,6 +23842,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18206,26 +23862,32 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -18233,6 +23895,7 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -18256,6 +23919,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -18276,6 +23945,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -18296,6 +23971,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18313,6 +23994,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18334,6 +24021,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -18354,6 +24047,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18372,6 +24071,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18388,6 +24093,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18405,6 +24116,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18422,6 +24139,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -18437,6 +24160,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -18452,6 +24181,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18466,6 +24201,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18480,26 +24221,32 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_cmpxchg( ; GFX1250-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -18507,6 +24254,7 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -18530,6 +24278,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -18550,6 +24304,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -18570,6 +24330,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18587,6 +24353,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18608,6 +24380,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -18628,6 +24406,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18646,6 +24430,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18662,6 +24452,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18679,6 +24475,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18696,6 +24498,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -18711,6 +24519,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -18726,6 +24540,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18740,6 +24560,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18754,26 +24580,32 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -18781,6 +24613,7 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -18804,6 +24637,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -18824,6 +24663,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -18844,6 +24689,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18861,6 +24712,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18882,6 +24739,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -18902,6 +24765,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18920,6 +24789,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18936,6 +24811,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18953,6 +24834,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18970,6 +24857,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -18985,6 +24878,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -19000,6 +24899,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19014,6 +24919,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19028,26 +24939,32 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -19055,6 +24972,7 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -19079,6 +24997,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_ret_cmpxch ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -19103,6 +25027,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_ret_cmpxch ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -19125,6 +25055,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_ret_cmpxch ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19144,6 +25080,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_ret_cmpxch ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19168,6 +25110,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_ret_cmpxch ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -19190,6 +25138,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19210,6 +25164,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19229,6 +25189,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19249,6 +25215,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_ret_cmpxch ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19268,6 +25240,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_ret_cmpxch ; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -19284,6 +25262,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_ret_cmpxch ; GFX11-CU-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -19300,6 +25284,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19315,6 +25305,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_ret_cmpxch ; GFX12-CU-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19331,25 +25327,31 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_ret_cmpxch ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -19381,6 +25383,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -19405,6 +25413,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -19427,6 +25441,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19446,6 +25466,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19470,6 +25496,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -19492,6 +25524,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19512,6 +25550,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19531,6 +25575,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19551,6 +25601,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19570,6 +25626,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -19586,6 +25648,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -19602,6 +25670,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19617,6 +25691,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19633,25 +25713,31 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -19685,6 +25771,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -19709,6 +25801,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -19731,6 +25829,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19750,6 +25854,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19774,6 +25884,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -19796,6 +25912,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19816,6 +25938,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19835,6 +25963,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19855,6 +25989,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19874,6 +26014,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -19890,6 +26036,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -19906,6 +26058,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19921,6 +26079,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19937,25 +26101,31 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -19991,6 +26161,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -20015,6 +26191,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -20037,6 +26219,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20056,6 +26244,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20080,6 +26274,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -20102,6 +26302,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20122,6 +26328,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20141,6 +26353,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20161,6 +26379,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20180,6 +26404,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -20196,6 +26426,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -20212,6 +26448,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20227,6 +26469,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20243,25 +26491,31 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -20297,6 +26551,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -20321,6 +26581,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -20343,6 +26609,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20362,6 +26634,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20386,6 +26664,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -20408,6 +26692,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20428,6 +26718,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20447,6 +26743,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20467,6 +26769,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20486,6 +26794,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -20502,6 +26816,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -20518,6 +26838,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20533,6 +26859,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20549,25 +26881,31 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -20601,6 +26939,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -20625,6 +26969,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -20647,6 +26997,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20666,6 +27022,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20690,6 +27052,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -20712,6 +27080,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20732,6 +27106,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20751,6 +27131,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20771,6 +27157,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20790,6 +27182,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -20806,6 +27204,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -20822,6 +27226,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20837,6 +27247,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20853,25 +27269,31 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -20905,6 +27327,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -20929,6 +27357,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -20951,6 +27385,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20970,6 +27410,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20994,6 +27440,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -21016,6 +27468,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21036,6 +27494,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21055,6 +27519,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21075,6 +27545,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21094,6 +27570,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -21110,6 +27592,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -21126,6 +27614,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21141,6 +27635,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21157,25 +27657,31 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -21211,6 +27717,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -21235,6 +27747,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -21257,6 +27775,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21276,6 +27800,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21300,6 +27830,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -21322,6 +27858,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21342,6 +27884,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21361,6 +27909,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21381,6 +27935,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21400,6 +27960,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -21416,6 +27982,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -21432,6 +28004,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21447,6 +28025,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21463,25 +28047,31 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -21517,6 +28107,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -21541,6 +28137,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -21563,6 +28165,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21582,6 +28190,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21606,6 +28220,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -21628,6 +28248,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21648,6 +28274,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21667,6 +28299,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21687,6 +28325,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21706,6 +28350,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -21722,6 +28372,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -21738,6 +28394,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21753,6 +28415,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21769,25 +28437,31 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -21823,6 +28497,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -21847,6 +28527,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -21869,6 +28555,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21888,6 +28580,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21912,6 +28610,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -21934,6 +28638,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21954,6 +28664,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21973,6 +28689,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21993,6 +28715,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22012,6 +28740,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -22028,6 +28762,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -22044,6 +28784,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22059,6 +28805,12 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22075,25 +28827,31 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -22129,6 +28887,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -22153,6 +28917,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -22175,6 +28945,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22194,6 +28970,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22218,6 +29000,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -22240,6 +29028,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22260,6 +29054,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22279,6 +29079,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22299,6 +29105,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22318,6 +29130,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -22334,6 +29152,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -22350,6 +29174,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22365,6 +29195,12 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22381,25 +29217,31 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -22435,6 +29277,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -22459,6 +29307,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -22481,6 +29335,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22500,6 +29360,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22524,6 +29390,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -22546,6 +29418,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22566,6 +29444,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22585,6 +29469,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22605,6 +29495,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22624,6 +29520,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -22640,6 +29542,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -22656,6 +29564,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22671,6 +29585,12 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22687,25 +29607,31 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -22741,6 +29667,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -22765,6 +29697,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -22787,6 +29725,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22806,6 +29750,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22830,6 +29780,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -22852,6 +29808,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22872,6 +29834,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22891,6 +29859,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22911,6 +29885,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22930,6 +29910,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -22946,6 +29932,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -22962,6 +29954,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22977,6 +29975,12 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22993,25 +29997,31 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -23047,6 +30057,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -23071,6 +30087,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -23093,6 +30115,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -23112,6 +30140,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -23136,6 +30170,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -23158,6 +30198,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23178,6 +30224,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23197,6 +30249,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -23217,6 +30275,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -23236,6 +30300,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -23252,6 +30322,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -23268,6 +30344,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -23283,6 +30365,12 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -23299,25 +30387,31 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll index f7bdceb5bd5c..07c5c809f0da 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll @@ -6,24 +6,32 @@ define amdgpu_kernel void @private_last_use_load_0(ptr addrspace(5) %in, ptr addrspace(1) %out) { ; GFX12-LABEL: private_last_use_load_0: ; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_LU ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm ; ; GFX1250-LABEL: private_last_use_load_0: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_LU ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm entry: @@ -36,7 +44,11 @@ define amdgpu_kernel void @private_last_use_load_1(ptr addrspace(5) %in, ptr add ; GFX12-LABEL: private_last_use_load_1: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: v_mov_b32_e32 v1, v0 +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_mov_b32 s3, 0x3ff @@ -44,9 +56,9 @@ define amdgpu_kernel void @private_last_use_load_1(ptr addrspace(5) %in, ptr add ; GFX12-NEXT: s_mov_b32 s3, 2 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_lshlrev_b32_e64 v1, s3, v1 -; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: scratch_load_b32 v1, v1, s2 th:TH_LOAD_LU ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm ; @@ -54,14 +66,18 @@ define amdgpu_kernel void @private_last_use_load_1(ptr addrspace(5) %in, ptr add ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_mov_b32 s3, 0x3ff ; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_load_b32 v1, v1, s2 scale_offset th:TH_LOAD_LU ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm entry: @@ -75,27 +91,35 @@ entry: define amdgpu_kernel void @private_last_use_and_volatile_load(ptr addrspace(5) %in, ptr addrspace(1) %out) { ; GFX12-LABEL: private_last_use_and_volatile_load: ; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_BYPASS scope:SCOPE_SYS ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm ; ; GFX1250-LABEL: private_last_use_and_volatile_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_BYPASS scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm entry: @@ -107,24 +131,32 @@ entry: define amdgpu_kernel void @private_last_use_and_nontemporal_load(ptr addrspace(5) %in, ptr addrspace(1) %out) { ; GFX12-LABEL: private_last_use_and_nontemporal_load: ; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_LU ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm ; ; GFX1250-LABEL: private_last_use_and_nontemporal_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_LU ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll index 5e79e414b741..69aa48ff1daa 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -20,6 +20,9 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -44,11 +47,15 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen glc slc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -59,13 +66,16 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen slc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; @@ -73,13 +83,16 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen slc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -92,6 +105,9 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -113,13 +129,16 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc slc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -127,91 +146,118 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc slc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_nontemporal_load_0: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v1, off, s2 nt -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_nontemporal_load_0: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v1, off, s2 nt -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_nontemporal_load_0: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v1, off, s2 slc dlc -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_nontemporal_load_0: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v1, off, s2 slc dlc -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_nontemporal_load_0: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_NT ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_nontemporal_load_0: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_NT ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_nontemporal_load_0: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_NT ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(1) %out) { @@ -227,6 +273,9 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -253,13 +302,17 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v0, s7, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], s6, v0 ; GFX7-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen glc slc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -271,13 +324,16 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-WGP-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen slc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; @@ -286,13 +342,16 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-CU-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen slc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -305,6 +364,9 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -329,15 +391,18 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s7, 0x3ff ; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc slc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -346,82 +411,101 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s7, 0x3ff ; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc slc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_nontemporal_load_1: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 0x3ff ; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v1, v1, off nt -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_nontemporal_load_1: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 0x3ff ; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v1, v1, off nt -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_nontemporal_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff ; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-WGP-NEXT: scratch_load_b32 v1, v1, off slc dlc -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_nontemporal_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff ; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-CU-NEXT: scratch_load_b32 v1, v1, off slc dlc -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff @@ -429,16 +513,20 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX12-WGP-NEXT: s_mov_b32 s3, 2 ; GFX12-WGP-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s3, v1 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v1, v1, s2 th:TH_LOAD_NT ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff @@ -446,9 +534,9 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX12-CU-NEXT: s_mov_b32 s3, 2 ; GFX12-CU-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s3, v1 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v1, v1, s2 th:TH_LOAD_NT ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; @@ -456,14 +544,18 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_mov_b32 s3, 0x3ff ; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_load_b32 v1, v1, s2 scale_offset th:TH_LOAD_NT ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(1) %out) { @@ -480,9 +572,12 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -494,9 +589,12 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -508,9 +606,12 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -522,9 +623,12 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -540,9 +644,12 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -554,9 +661,12 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -568,9 +678,12 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -580,9 +693,12 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX942-NOTTGSPLIT-LABEL: private_nontemporal_store_0: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -591,9 +707,12 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX942-TGSPLIT-LABEL: private_nontemporal_store_0: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -602,9 +721,12 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX11-WGP-LABEL: private_nontemporal_store_0: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -613,9 +735,12 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX11-CU-LABEL: private_nontemporal_store_0: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -624,9 +749,12 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX12-WGP-LABEL: private_nontemporal_store_0: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -635,9 +763,12 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX12-CU-LABEL: private_nontemporal_store_0: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -647,9 +778,12 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; GFX1250-LABEL: private_nontemporal_store_0: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -667,6 +801,9 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -683,6 +820,9 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -699,6 +839,9 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -713,6 +856,9 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -731,6 +877,9 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -747,6 +896,9 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -763,6 +915,9 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -777,6 +932,9 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX942-NOTTGSPLIT-LABEL: private_nontemporal_store_1: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -791,6 +949,9 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX942-TGSPLIT-LABEL: private_nontemporal_store_1: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -805,6 +966,9 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX11-WGP-LABEL: private_nontemporal_store_1: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -819,6 +983,9 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX11-CU-LABEL: private_nontemporal_store_1: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -833,9 +1000,12 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX12-WGP-LABEL: private_nontemporal_store_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 @@ -849,9 +1019,12 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX12-CU-LABEL: private_nontemporal_store_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 @@ -866,9 +1039,12 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX1250-LABEL: private_nontemporal_store_1: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b32 s2, 0x3ff @@ -892,6 +1068,9 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -916,12 +1095,15 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen glc -; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_store_dword v[0:1], v2 @@ -931,13 +1113,16 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; @@ -945,13 +1130,16 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -964,6 +1152,9 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -985,13 +1176,16 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -999,96 +1193,123 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v1, off, s2 sc0 sc1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_nontemporal_volatile_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v1, off, s2 sc0 sc1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_nontemporal_volatile_load: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v1, off, s2 glc dlc -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_nontemporal_volatile_load: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v1, off, s2 glc dlc -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_nontemporal_volatile_load: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_nontemporal_volatile_load: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_nontemporal_volatile_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(1) %out) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll index 929665f37737..238ccd2e7bac 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll @@ -19,11 +19,15 @@ define amdgpu_kernel void @private_singlethread_unordered_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -33,11 +37,15 @@ define amdgpu_kernel void @private_singlethread_unordered_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -47,11 +55,15 @@ define amdgpu_kernel void @private_singlethread_unordered_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -61,11 +73,15 @@ define amdgpu_kernel void @private_singlethread_unordered_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -79,11 +95,15 @@ define amdgpu_kernel void @private_singlethread_unordered_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -93,11 +113,15 @@ define amdgpu_kernel void @private_singlethread_unordered_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -107,11 +131,15 @@ define amdgpu_kernel void @private_singlethread_unordered_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -119,84 +147,108 @@ define amdgpu_kernel void @private_singlethread_unordered_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_singlethread_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_singlethread_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_singlethread_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_singlethread_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_singlethread_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_singlethread_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -205,6 +257,7 @@ define amdgpu_kernel void @private_singlethread_unordered_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -219,11 +272,15 @@ define amdgpu_kernel void @private_singlethread_monotonic_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -233,11 +290,15 @@ define amdgpu_kernel void @private_singlethread_monotonic_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -247,11 +308,15 @@ define amdgpu_kernel void @private_singlethread_monotonic_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -261,11 +326,15 @@ define amdgpu_kernel void @private_singlethread_monotonic_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -279,11 +348,15 @@ define amdgpu_kernel void @private_singlethread_monotonic_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -293,11 +366,15 @@ define amdgpu_kernel void @private_singlethread_monotonic_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -307,11 +384,15 @@ define amdgpu_kernel void @private_singlethread_monotonic_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -319,84 +400,108 @@ define amdgpu_kernel void @private_singlethread_monotonic_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_singlethread_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_singlethread_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_singlethread_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_singlethread_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_singlethread_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_singlethread_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -405,6 +510,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -419,11 +525,15 @@ define amdgpu_kernel void @private_singlethread_acquire_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -433,11 +543,15 @@ define amdgpu_kernel void @private_singlethread_acquire_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -447,11 +561,15 @@ define amdgpu_kernel void @private_singlethread_acquire_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -461,11 +579,15 @@ define amdgpu_kernel void @private_singlethread_acquire_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -479,11 +601,15 @@ define amdgpu_kernel void @private_singlethread_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -493,11 +619,15 @@ define amdgpu_kernel void @private_singlethread_acquire_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -507,11 +637,15 @@ define amdgpu_kernel void @private_singlethread_acquire_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -519,84 +653,108 @@ define amdgpu_kernel void @private_singlethread_acquire_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_singlethread_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_singlethread_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_singlethread_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_singlethread_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_singlethread_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_singlethread_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -605,6 +763,7 @@ define amdgpu_kernel void @private_singlethread_acquire_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -619,11 +778,15 @@ define amdgpu_kernel void @private_singlethread_seq_cst_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -633,11 +796,15 @@ define amdgpu_kernel void @private_singlethread_seq_cst_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -647,11 +814,15 @@ define amdgpu_kernel void @private_singlethread_seq_cst_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -661,11 +832,15 @@ define amdgpu_kernel void @private_singlethread_seq_cst_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -679,11 +854,15 @@ define amdgpu_kernel void @private_singlethread_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -693,11 +872,15 @@ define amdgpu_kernel void @private_singlethread_seq_cst_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -707,11 +890,15 @@ define amdgpu_kernel void @private_singlethread_seq_cst_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -719,84 +906,108 @@ define amdgpu_kernel void @private_singlethread_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_singlethread_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_singlethread_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_singlethread_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_singlethread_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_singlethread_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_singlethread_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -805,6 +1016,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -819,10 +1031,14 @@ define amdgpu_kernel void @private_singlethread_unordered_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -831,10 +1047,14 @@ define amdgpu_kernel void @private_singlethread_unordered_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -843,10 +1063,14 @@ define amdgpu_kernel void @private_singlethread_unordered_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -855,10 +1079,14 @@ define amdgpu_kernel void @private_singlethread_unordered_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -871,10 +1099,14 @@ define amdgpu_kernel void @private_singlethread_unordered_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -883,10 +1115,14 @@ define amdgpu_kernel void @private_singlethread_unordered_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -895,65 +1131,93 @@ define amdgpu_kernel void @private_singlethread_unordered_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_singlethread_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_singlethread_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_singlethread_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_singlethread_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_singlethread_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -961,22 +1225,26 @@ define amdgpu_kernel void @private_singlethread_unordered_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -998,10 +1266,14 @@ define amdgpu_kernel void @private_singlethread_monotonic_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -1010,10 +1282,14 @@ define amdgpu_kernel void @private_singlethread_monotonic_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -1022,10 +1298,14 @@ define amdgpu_kernel void @private_singlethread_monotonic_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -1034,10 +1314,14 @@ define amdgpu_kernel void @private_singlethread_monotonic_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -1050,10 +1334,14 @@ define amdgpu_kernel void @private_singlethread_monotonic_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1062,10 +1350,14 @@ define amdgpu_kernel void @private_singlethread_monotonic_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1074,65 +1366,93 @@ define amdgpu_kernel void @private_singlethread_monotonic_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_singlethread_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_singlethread_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_singlethread_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_singlethread_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_singlethread_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -1140,22 +1460,26 @@ define amdgpu_kernel void @private_singlethread_monotonic_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -1177,10 +1501,14 @@ define amdgpu_kernel void @private_singlethread_release_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -1189,10 +1517,14 @@ define amdgpu_kernel void @private_singlethread_release_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -1201,10 +1533,14 @@ define amdgpu_kernel void @private_singlethread_release_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -1213,10 +1549,14 @@ define amdgpu_kernel void @private_singlethread_release_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -1229,10 +1569,14 @@ define amdgpu_kernel void @private_singlethread_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1241,10 +1585,14 @@ define amdgpu_kernel void @private_singlethread_release_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1253,65 +1601,93 @@ define amdgpu_kernel void @private_singlethread_release_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_singlethread_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_singlethread_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_singlethread_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_singlethread_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_singlethread_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -1319,22 +1695,26 @@ define amdgpu_kernel void @private_singlethread_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -1356,10 +1736,14 @@ define amdgpu_kernel void @private_singlethread_seq_cst_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -1368,10 +1752,14 @@ define amdgpu_kernel void @private_singlethread_seq_cst_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -1380,10 +1768,14 @@ define amdgpu_kernel void @private_singlethread_seq_cst_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -1392,10 +1784,14 @@ define amdgpu_kernel void @private_singlethread_seq_cst_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -1408,10 +1804,14 @@ define amdgpu_kernel void @private_singlethread_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1420,10 +1820,14 @@ define amdgpu_kernel void @private_singlethread_seq_cst_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1432,65 +1836,93 @@ define amdgpu_kernel void @private_singlethread_seq_cst_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_singlethread_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_singlethread_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_singlethread_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_singlethread_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_singlethread_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -1498,22 +1930,26 @@ define amdgpu_kernel void @private_singlethread_seq_cst_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -1536,8 +1972,15 @@ define amdgpu_kernel void @private_singlethread_monotonic_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1548,8 +1991,15 @@ define amdgpu_kernel void @private_singlethread_monotonic_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1560,8 +2010,15 @@ define amdgpu_kernel void @private_singlethread_monotonic_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1572,8 +2029,15 @@ define amdgpu_kernel void @private_singlethread_monotonic_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1588,8 +2052,15 @@ define amdgpu_kernel void @private_singlethread_monotonic_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -1600,8 +2071,15 @@ define amdgpu_kernel void @private_singlethread_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1612,8 +2090,15 @@ define amdgpu_kernel void @private_singlethread_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1622,8 +2107,14 @@ define amdgpu_kernel void @private_singlethread_monotonic_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -1631,8 +2122,14 @@ define amdgpu_kernel void @private_singlethread_monotonic_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_singlethread_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -1640,8 +2137,14 @@ define amdgpu_kernel void @private_singlethread_monotonic_atomicrmw( ; GFX11-WGP-LABEL: private_singlethread_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -1649,8 +2152,14 @@ define amdgpu_kernel void @private_singlethread_monotonic_atomicrmw( ; GFX11-CU-LABEL: private_singlethread_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -1658,8 +2167,14 @@ define amdgpu_kernel void @private_singlethread_monotonic_atomicrmw( ; GFX12-WGP-LABEL: private_singlethread_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -1667,8 +2182,14 @@ define amdgpu_kernel void @private_singlethread_monotonic_atomicrmw( ; GFX12-CU-LABEL: private_singlethread_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -1676,29 +2197,34 @@ define amdgpu_kernel void @private_singlethread_monotonic_atomicrmw( ; GFX1250-LABEL: private_singlethread_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -1715,8 +2241,15 @@ define amdgpu_kernel void @private_singlethread_acquire_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1727,8 +2260,15 @@ define amdgpu_kernel void @private_singlethread_acquire_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1739,8 +2279,15 @@ define amdgpu_kernel void @private_singlethread_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1751,8 +2298,15 @@ define amdgpu_kernel void @private_singlethread_acquire_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1767,8 +2321,15 @@ define amdgpu_kernel void @private_singlethread_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -1779,8 +2340,15 @@ define amdgpu_kernel void @private_singlethread_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1791,8 +2359,15 @@ define amdgpu_kernel void @private_singlethread_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1801,8 +2376,14 @@ define amdgpu_kernel void @private_singlethread_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -1810,8 +2391,14 @@ define amdgpu_kernel void @private_singlethread_acquire_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_singlethread_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -1819,8 +2406,14 @@ define amdgpu_kernel void @private_singlethread_acquire_atomicrmw( ; GFX11-WGP-LABEL: private_singlethread_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -1828,8 +2421,14 @@ define amdgpu_kernel void @private_singlethread_acquire_atomicrmw( ; GFX11-CU-LABEL: private_singlethread_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -1837,8 +2436,14 @@ define amdgpu_kernel void @private_singlethread_acquire_atomicrmw( ; GFX12-WGP-LABEL: private_singlethread_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -1846,8 +2451,14 @@ define amdgpu_kernel void @private_singlethread_acquire_atomicrmw( ; GFX12-CU-LABEL: private_singlethread_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -1855,29 +2466,34 @@ define amdgpu_kernel void @private_singlethread_acquire_atomicrmw( ; GFX1250-LABEL: private_singlethread_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -1894,8 +2510,15 @@ define amdgpu_kernel void @private_singlethread_release_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1906,8 +2529,15 @@ define amdgpu_kernel void @private_singlethread_release_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1918,8 +2548,15 @@ define amdgpu_kernel void @private_singlethread_release_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1930,8 +2567,15 @@ define amdgpu_kernel void @private_singlethread_release_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1946,8 +2590,15 @@ define amdgpu_kernel void @private_singlethread_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -1958,8 +2609,15 @@ define amdgpu_kernel void @private_singlethread_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1970,8 +2628,15 @@ define amdgpu_kernel void @private_singlethread_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1980,8 +2645,14 @@ define amdgpu_kernel void @private_singlethread_release_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -1989,8 +2660,14 @@ define amdgpu_kernel void @private_singlethread_release_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_singlethread_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -1998,8 +2675,14 @@ define amdgpu_kernel void @private_singlethread_release_atomicrmw( ; GFX11-WGP-LABEL: private_singlethread_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -2007,8 +2690,14 @@ define amdgpu_kernel void @private_singlethread_release_atomicrmw( ; GFX11-CU-LABEL: private_singlethread_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -2016,8 +2705,14 @@ define amdgpu_kernel void @private_singlethread_release_atomicrmw( ; GFX12-WGP-LABEL: private_singlethread_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -2025,8 +2720,14 @@ define amdgpu_kernel void @private_singlethread_release_atomicrmw( ; GFX12-CU-LABEL: private_singlethread_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -2034,29 +2735,34 @@ define amdgpu_kernel void @private_singlethread_release_atomicrmw( ; GFX1250-LABEL: private_singlethread_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -2073,8 +2779,15 @@ define amdgpu_kernel void @private_singlethread_acq_rel_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2085,8 +2798,15 @@ define amdgpu_kernel void @private_singlethread_acq_rel_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2097,8 +2817,15 @@ define amdgpu_kernel void @private_singlethread_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2109,8 +2836,15 @@ define amdgpu_kernel void @private_singlethread_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2125,8 +2859,15 @@ define amdgpu_kernel void @private_singlethread_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -2137,8 +2878,15 @@ define amdgpu_kernel void @private_singlethread_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2149,8 +2897,15 @@ define amdgpu_kernel void @private_singlethread_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2159,8 +2914,14 @@ define amdgpu_kernel void @private_singlethread_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2168,8 +2929,14 @@ define amdgpu_kernel void @private_singlethread_acq_rel_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_singlethread_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -2177,8 +2944,14 @@ define amdgpu_kernel void @private_singlethread_acq_rel_atomicrmw( ; GFX11-WGP-LABEL: private_singlethread_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -2186,8 +2959,14 @@ define amdgpu_kernel void @private_singlethread_acq_rel_atomicrmw( ; GFX11-CU-LABEL: private_singlethread_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -2195,8 +2974,14 @@ define amdgpu_kernel void @private_singlethread_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: private_singlethread_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -2204,8 +2989,14 @@ define amdgpu_kernel void @private_singlethread_acq_rel_atomicrmw( ; GFX12-CU-LABEL: private_singlethread_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -2213,29 +3004,34 @@ define amdgpu_kernel void @private_singlethread_acq_rel_atomicrmw( ; GFX1250-LABEL: private_singlethread_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -2252,8 +3048,15 @@ define amdgpu_kernel void @private_singlethread_seq_cst_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2264,8 +3067,15 @@ define amdgpu_kernel void @private_singlethread_seq_cst_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2276,8 +3086,15 @@ define amdgpu_kernel void @private_singlethread_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2288,8 +3105,15 @@ define amdgpu_kernel void @private_singlethread_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2304,8 +3128,15 @@ define amdgpu_kernel void @private_singlethread_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -2316,8 +3147,15 @@ define amdgpu_kernel void @private_singlethread_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2328,8 +3166,15 @@ define amdgpu_kernel void @private_singlethread_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2338,8 +3183,14 @@ define amdgpu_kernel void @private_singlethread_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2347,8 +3198,14 @@ define amdgpu_kernel void @private_singlethread_seq_cst_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_singlethread_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -2356,8 +3213,14 @@ define amdgpu_kernel void @private_singlethread_seq_cst_atomicrmw( ; GFX11-WGP-LABEL: private_singlethread_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -2365,8 +3228,14 @@ define amdgpu_kernel void @private_singlethread_seq_cst_atomicrmw( ; GFX11-CU-LABEL: private_singlethread_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -2374,8 +3243,14 @@ define amdgpu_kernel void @private_singlethread_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: private_singlethread_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -2383,8 +3258,14 @@ define amdgpu_kernel void @private_singlethread_seq_cst_atomicrmw( ; GFX12-CU-LABEL: private_singlethread_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -2392,29 +3273,34 @@ define amdgpu_kernel void @private_singlethread_seq_cst_atomicrmw( ; GFX1250-LABEL: private_singlethread_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -2431,6 +3317,10 @@ define amdgpu_kernel void @private_singlethread_acquire_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2449,6 +3339,10 @@ define amdgpu_kernel void @private_singlethread_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2466,6 +3360,10 @@ define amdgpu_kernel void @private_singlethread_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2483,6 +3381,10 @@ define amdgpu_kernel void @private_singlethread_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2504,6 +3406,10 @@ define amdgpu_kernel void @private_singlethread_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2521,6 +3427,10 @@ define amdgpu_kernel void @private_singlethread_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2538,6 +3448,10 @@ define amdgpu_kernel void @private_singlethread_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2553,6 +3467,10 @@ define amdgpu_kernel void @private_singlethread_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -2565,6 +3483,10 @@ define amdgpu_kernel void @private_singlethread_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_singlethread_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -2577,6 +3499,10 @@ define amdgpu_kernel void @private_singlethread_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: private_singlethread_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -2589,6 +3515,10 @@ define amdgpu_kernel void @private_singlethread_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: private_singlethread_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -2601,6 +3531,10 @@ define amdgpu_kernel void @private_singlethread_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: private_singlethread_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -2613,6 +3547,10 @@ define amdgpu_kernel void @private_singlethread_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: private_singlethread_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -2626,22 +3564,26 @@ define amdgpu_kernel void @private_singlethread_acquire_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -2667,6 +3609,10 @@ define amdgpu_kernel void @private_singlethread_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2685,6 +3631,10 @@ define amdgpu_kernel void @private_singlethread_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2702,6 +3652,10 @@ define amdgpu_kernel void @private_singlethread_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2719,6 +3673,10 @@ define amdgpu_kernel void @private_singlethread_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2740,6 +3698,10 @@ define amdgpu_kernel void @private_singlethread_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2757,6 +3719,10 @@ define amdgpu_kernel void @private_singlethread_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2774,6 +3740,10 @@ define amdgpu_kernel void @private_singlethread_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2789,6 +3759,10 @@ define amdgpu_kernel void @private_singlethread_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -2801,6 +3775,10 @@ define amdgpu_kernel void @private_singlethread_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_singlethread_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -2813,6 +3791,10 @@ define amdgpu_kernel void @private_singlethread_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: private_singlethread_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -2825,6 +3807,10 @@ define amdgpu_kernel void @private_singlethread_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: private_singlethread_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -2837,6 +3823,10 @@ define amdgpu_kernel void @private_singlethread_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: private_singlethread_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -2849,6 +3839,10 @@ define amdgpu_kernel void @private_singlethread_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: private_singlethread_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -2862,22 +3856,26 @@ define amdgpu_kernel void @private_singlethread_acq_rel_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -2903,6 +3901,10 @@ define amdgpu_kernel void @private_singlethread_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2921,6 +3923,10 @@ define amdgpu_kernel void @private_singlethread_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2938,6 +3944,10 @@ define amdgpu_kernel void @private_singlethread_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2955,6 +3965,10 @@ define amdgpu_kernel void @private_singlethread_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2976,6 +3990,10 @@ define amdgpu_kernel void @private_singlethread_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2993,6 +4011,10 @@ define amdgpu_kernel void @private_singlethread_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -3010,6 +4032,10 @@ define amdgpu_kernel void @private_singlethread_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -3025,6 +4051,10 @@ define amdgpu_kernel void @private_singlethread_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -3037,6 +4067,10 @@ define amdgpu_kernel void @private_singlethread_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_singlethread_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -3049,6 +4083,10 @@ define amdgpu_kernel void @private_singlethread_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: private_singlethread_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -3061,6 +4099,10 @@ define amdgpu_kernel void @private_singlethread_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: private_singlethread_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -3073,6 +4115,10 @@ define amdgpu_kernel void @private_singlethread_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: private_singlethread_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -3085,6 +4131,10 @@ define amdgpu_kernel void @private_singlethread_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: private_singlethread_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -3098,22 +4148,26 @@ define amdgpu_kernel void @private_singlethread_seq_cst_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -3139,6 +4193,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -3159,6 +4219,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -3179,6 +4245,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3196,6 +4268,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3217,6 +4295,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -3237,6 +4321,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3255,6 +4345,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3271,6 +4367,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3288,6 +4390,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_singlethread_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3305,6 +4413,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -3320,6 +4434,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -3335,6 +4455,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3349,6 +4475,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3363,26 +4495,32 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: private_singlethread_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -3390,6 +4528,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -3409,6 +4548,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -3429,6 +4574,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -3449,6 +4600,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3466,6 +4623,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3487,6 +4650,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -3507,6 +4676,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3525,6 +4700,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3541,6 +4722,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3558,6 +4745,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_singlethread_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3575,6 +4768,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -3590,6 +4789,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -3605,6 +4810,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3619,6 +4830,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3633,26 +4850,32 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: private_singlethread_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -3660,6 +4883,7 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -3679,6 +4903,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -3699,6 +4929,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -3719,6 +4955,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3736,6 +4978,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3757,6 +5005,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -3777,6 +5031,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3795,6 +5055,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3811,6 +5077,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3828,6 +5100,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_singlethread_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3845,6 +5123,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -3860,6 +5144,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -3875,6 +5165,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3889,6 +5185,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3903,26 +5205,32 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_cmpxchg( ; GFX1250-LABEL: private_singlethread_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -3930,6 +5238,7 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -3949,6 +5258,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -3969,6 +5284,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -3989,6 +5310,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4006,6 +5333,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4027,6 +5360,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -4047,6 +5386,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4065,6 +5410,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4081,6 +5432,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4098,6 +5455,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4115,6 +5478,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -4130,6 +5499,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -4145,6 +5520,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4159,6 +5540,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4173,26 +5560,32 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -4200,6 +5593,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -4219,6 +5613,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -4239,6 +5639,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -4259,6 +5665,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4276,6 +5688,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4297,6 +5715,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -4317,6 +5741,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4335,6 +5765,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4351,6 +5787,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4368,6 +5810,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4385,6 +5833,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -4400,6 +5854,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -4415,6 +5875,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4429,6 +5895,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4443,26 +5915,32 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -4470,6 +5948,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -4489,6 +5968,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -4509,6 +5994,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -4529,6 +6020,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4546,6 +6043,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4567,6 +6070,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -4587,6 +6096,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4605,6 +6120,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4621,6 +6142,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4638,6 +6165,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_singlethread_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4655,6 +6188,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -4670,6 +6209,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -4685,6 +6230,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4699,6 +6250,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4713,26 +6270,32 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: private_singlethread_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -4740,6 +6303,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -4759,6 +6323,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -4779,6 +6349,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -4799,6 +6375,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4816,6 +6398,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4837,6 +6425,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -4857,6 +6451,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4875,6 +6475,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4891,6 +6497,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4908,6 +6520,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_singlethread_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4925,6 +6543,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -4940,6 +6564,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -4955,6 +6585,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4969,6 +6605,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4983,26 +6625,32 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_cmpxchg( ; GFX1250-LABEL: private_singlethread_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -5010,6 +6658,7 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -5029,6 +6678,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -5049,6 +6704,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -5069,6 +6730,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5086,6 +6753,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5107,6 +6780,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -5127,6 +6806,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5145,6 +6830,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5161,6 +6852,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5178,6 +6875,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_singlethread_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5195,6 +6898,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -5210,6 +6919,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -5225,6 +6940,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5239,6 +6960,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5253,26 +6980,32 @@ define amdgpu_kernel void @private_singlethread_release_acquire_cmpxchg( ; GFX1250-LABEL: private_singlethread_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -5280,6 +7013,7 @@ define amdgpu_kernel void @private_singlethread_release_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -5299,6 +7033,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -5319,6 +7059,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -5339,6 +7085,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5356,6 +7108,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5377,6 +7135,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -5397,6 +7161,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5415,6 +7185,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5431,6 +7207,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5448,6 +7230,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_singlethread_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5465,6 +7253,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -5480,6 +7274,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -5495,6 +7295,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5509,6 +7315,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5523,26 +7335,32 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: private_singlethread_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -5550,6 +7368,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -5569,6 +7388,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -5589,6 +7414,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -5609,6 +7440,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5626,6 +7463,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5647,6 +7490,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -5667,6 +7516,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5685,6 +7540,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5701,6 +7562,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5718,6 +7585,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_singlethread_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5735,6 +7608,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -5750,6 +7629,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -5765,6 +7650,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5779,6 +7670,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5793,26 +7690,32 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: private_singlethread_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -5820,6 +7723,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -5839,6 +7743,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -5859,6 +7769,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -5879,6 +7795,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5896,6 +7818,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5917,6 +7845,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -5937,6 +7871,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5955,6 +7895,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5971,6 +7917,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5988,6 +7940,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6005,6 +7963,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -6020,6 +7984,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -6035,6 +8005,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6049,6 +8025,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6063,26 +8045,32 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -6090,6 +8078,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -6109,6 +8098,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -6129,6 +8124,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -6149,6 +8150,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6166,6 +8173,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6187,6 +8200,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -6207,6 +8226,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6225,6 +8250,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6241,6 +8272,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6258,6 +8295,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_singlethread_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6275,6 +8318,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -6290,6 +8339,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -6305,6 +8360,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6319,6 +8380,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6333,26 +8400,32 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: private_singlethread_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -6360,6 +8433,7 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -6379,6 +8453,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -6399,6 +8479,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -6419,6 +8505,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6436,6 +8528,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6457,6 +8555,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -6477,6 +8581,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6495,6 +8605,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6511,6 +8627,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6528,6 +8650,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_singlethread_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6545,6 +8673,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -6560,6 +8694,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -6575,6 +8715,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6589,6 +8735,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6603,26 +8755,32 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_cmpxchg( ; GFX1250-LABEL: private_singlethread_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -6630,6 +8788,7 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -6649,6 +8808,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -6669,6 +8834,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -6689,6 +8860,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6706,6 +8883,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6727,6 +8910,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -6747,6 +8936,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6765,6 +8960,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6781,6 +8982,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6798,6 +9005,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6815,6 +9028,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -6830,6 +9049,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -6845,6 +9070,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6859,6 +9090,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6873,26 +9110,32 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -6900,6 +9143,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -6919,6 +9163,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -6939,6 +9189,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -6959,6 +9215,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6976,6 +9238,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6997,6 +9265,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -7017,6 +9291,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7035,6 +9315,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7051,6 +9337,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -7068,6 +9360,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -7085,6 +9383,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -7100,6 +9404,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -7115,6 +9425,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7129,6 +9445,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7143,26 +9465,32 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -7170,6 +9498,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -7190,6 +9519,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -7214,6 +9549,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -7236,6 +9577,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7255,6 +9602,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7279,6 +9632,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -7301,6 +9660,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7321,6 +9686,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7340,6 +9711,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7360,6 +9737,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7379,6 +9762,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -7395,6 +9784,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -7411,6 +9806,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7426,6 +9827,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7442,25 +9849,31 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -7492,6 +9905,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -7516,6 +9935,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -7538,6 +9963,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7557,6 +9988,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7581,6 +10018,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -7603,6 +10046,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7623,6 +10072,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7642,6 +10097,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7662,6 +10123,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7681,6 +10148,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -7697,6 +10170,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -7713,6 +10192,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7728,6 +10213,12 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7744,25 +10235,31 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -7794,6 +10291,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -7818,6 +10321,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -7840,6 +10349,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7859,6 +10374,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7883,6 +10404,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -7905,6 +10432,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7925,6 +10458,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7944,6 +10483,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7964,6 +10509,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7983,6 +10534,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -7999,6 +10556,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -8015,6 +10578,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8030,6 +10599,12 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8046,25 +10621,31 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -8096,6 +10677,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -8120,6 +10707,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -8142,6 +10735,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8161,6 +10760,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8185,6 +10790,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -8207,6 +10818,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8227,6 +10844,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8246,6 +10869,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8266,6 +10895,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8285,6 +10920,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -8301,6 +10942,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -8317,6 +10964,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8332,6 +10985,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8348,25 +11007,31 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -8398,6 +11063,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -8422,6 +11093,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -8444,6 +11121,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8463,6 +11146,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8487,6 +11176,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -8509,6 +11204,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8529,6 +11230,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8548,6 +11255,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8568,6 +11281,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8587,6 +11306,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -8603,6 +11328,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -8619,6 +11350,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8634,6 +11371,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8650,25 +11393,31 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -8700,6 +11449,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -8724,6 +11479,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -8746,6 +11507,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8765,6 +11532,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8789,6 +11562,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -8811,6 +11590,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8831,6 +11616,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8850,6 +11641,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8870,6 +11667,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8889,6 +11692,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -8905,6 +11714,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -8921,6 +11736,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8936,6 +11757,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8952,25 +11779,31 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -9002,6 +11835,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -9026,6 +11865,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -9048,6 +11893,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9067,6 +11918,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9091,6 +11948,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -9113,6 +11976,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9133,6 +12002,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9152,6 +12027,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9172,6 +12053,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9191,6 +12078,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -9207,6 +12100,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -9223,6 +12122,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9238,6 +12143,12 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9254,25 +12165,31 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -9304,6 +12221,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -9328,6 +12251,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -9350,6 +12279,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9369,6 +12304,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9393,6 +12334,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -9415,6 +12362,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9435,6 +12388,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9454,6 +12413,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9474,6 +12439,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9493,6 +12464,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -9509,6 +12486,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -9525,6 +12508,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9540,6 +12529,12 @@ define amdgpu_kernel void @private_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9556,25 +12551,31 @@ define amdgpu_kernel void @private_singlethread_release_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -9606,6 +12607,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -9630,6 +12637,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -9652,6 +12665,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9671,6 +12690,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9695,6 +12720,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -9717,6 +12748,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9737,6 +12774,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9756,6 +12799,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9776,6 +12825,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9795,6 +12850,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -9811,6 +12872,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -9827,6 +12894,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9842,6 +12915,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9858,25 +12937,31 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -9908,6 +12993,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -9932,6 +13023,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -9954,6 +13051,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9973,6 +13076,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9997,6 +13106,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -10019,6 +13134,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10039,6 +13160,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10058,6 +13185,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10078,6 +13211,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10097,6 +13236,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -10113,6 +13258,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -10129,6 +13280,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10144,6 +13301,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10160,25 +13323,31 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -10210,6 +13379,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -10234,6 +13409,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -10256,6 +13437,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10275,6 +13462,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10299,6 +13492,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -10321,6 +13520,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10341,6 +13546,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10360,6 +13571,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10380,6 +13597,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10399,6 +13622,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -10415,6 +13644,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -10431,6 +13666,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10446,6 +13687,12 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10462,25 +13709,31 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -10512,6 +13765,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -10536,6 +13795,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -10558,6 +13823,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10577,6 +13848,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10601,6 +13878,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -10623,6 +13906,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10643,6 +13932,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10662,6 +13957,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10682,6 +13983,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10701,6 +14008,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -10717,6 +14030,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -10733,6 +14052,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10748,6 +14073,12 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10764,25 +14095,31 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -10814,6 +14151,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -10838,6 +14181,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -10860,6 +14209,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10879,6 +14234,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10903,6 +14264,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -10925,6 +14292,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10945,6 +14318,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10964,6 +14343,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10984,6 +14369,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11003,6 +14394,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -11019,6 +14416,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -11035,6 +14438,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11050,6 +14459,12 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11066,25 +14481,31 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -11116,6 +14537,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -11140,6 +14567,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -11162,6 +14595,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11181,6 +14620,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11205,6 +14650,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -11227,6 +14678,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11247,6 +14704,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11266,6 +14729,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11286,6 +14755,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11305,6 +14780,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -11321,6 +14802,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -11337,6 +14824,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11352,6 +14845,12 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11368,25 +14867,31 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -11418,6 +14923,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -11442,6 +14953,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -11464,6 +14981,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11483,6 +15006,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11507,6 +15036,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -11529,6 +15064,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11549,6 +15090,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11568,6 +15115,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11588,6 +15141,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11607,6 +15166,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -11623,6 +15188,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -11639,6 +15210,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11654,6 +15231,12 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11670,25 +15253,31 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -11718,11 +15307,15 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11732,11 +15325,15 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11746,11 +15343,15 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11760,11 +15361,15 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11778,11 +15383,15 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -11792,11 +15401,15 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11806,11 +15419,15 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11818,84 +15435,108 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_singlethread_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_singlethread_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_singlethread_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_singlethread_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_singlethread_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -11904,6 +15545,7 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -11918,11 +15560,15 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11932,11 +15578,15 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11946,11 +15596,15 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11960,11 +15614,15 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11978,11 +15636,15 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -11992,11 +15654,15 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12006,11 +15672,15 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12018,84 +15688,108 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_singlethread_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_singlethread_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_singlethread_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_singlethread_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_singlethread_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12104,6 +15798,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -12118,11 +15813,15 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12132,11 +15831,15 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12146,11 +15849,15 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12160,11 +15867,15 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12178,11 +15889,15 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -12192,11 +15907,15 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12206,11 +15925,15 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12218,84 +15941,108 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_singlethread_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_singlethread_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_singlethread_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_singlethread_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_singlethread_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12304,6 +16051,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -12318,11 +16066,15 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12332,11 +16084,15 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12346,11 +16102,15 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12360,11 +16120,15 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12378,11 +16142,15 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -12392,11 +16160,15 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12406,11 +16178,15 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12418,84 +16194,108 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_singlethread_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_singlethread_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_singlethread_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_singlethread_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12504,6 +16304,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -12518,10 +16319,14 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -12530,10 +16335,14 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -12542,10 +16351,14 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -12554,10 +16367,14 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -12570,10 +16387,14 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -12582,10 +16403,14 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -12594,65 +16419,93 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_singlethread_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_singlethread_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_singlethread_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_singlethread_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -12660,22 +16513,26 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12697,10 +16554,14 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -12709,10 +16570,14 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -12721,10 +16586,14 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -12733,10 +16602,14 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -12749,10 +16622,14 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -12761,10 +16638,14 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -12773,65 +16654,93 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_singlethread_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_singlethread_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_singlethread_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_singlethread_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -12839,22 +16748,26 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12876,10 +16789,14 @@ define amdgpu_kernel void @private_singlethread_one_as_release_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -12888,10 +16805,14 @@ define amdgpu_kernel void @private_singlethread_one_as_release_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -12900,10 +16821,14 @@ define amdgpu_kernel void @private_singlethread_one_as_release_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -12912,10 +16837,14 @@ define amdgpu_kernel void @private_singlethread_one_as_release_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -12928,10 +16857,14 @@ define amdgpu_kernel void @private_singlethread_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -12940,10 +16873,14 @@ define amdgpu_kernel void @private_singlethread_one_as_release_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -12952,65 +16889,93 @@ define amdgpu_kernel void @private_singlethread_one_as_release_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_singlethread_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_singlethread_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_singlethread_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_singlethread_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -13018,22 +16983,26 @@ define amdgpu_kernel void @private_singlethread_one_as_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -13055,10 +17024,14 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -13067,10 +17040,14 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -13079,10 +17056,14 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -13091,10 +17072,14 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -13107,10 +17092,14 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -13119,10 +17108,14 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -13131,65 +17124,93 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_singlethread_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_singlethread_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_singlethread_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_singlethread_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -13197,22 +17218,26 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -13235,8 +17260,15 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13247,8 +17279,15 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13259,8 +17298,15 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13271,8 +17317,15 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13287,8 +17340,15 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -13299,8 +17359,15 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13311,8 +17378,15 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13321,8 +17395,14 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -13330,8 +17410,14 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -13339,8 +17425,14 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_atomicrmw( ; GFX11-WGP-LABEL: private_singlethread_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -13348,8 +17440,14 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_atomicrmw( ; GFX11-CU-LABEL: private_singlethread_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -13357,8 +17455,14 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_atomicrmw( ; GFX12-WGP-LABEL: private_singlethread_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -13366,8 +17470,14 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_atomicrmw( ; GFX12-CU-LABEL: private_singlethread_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -13375,29 +17485,34 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_atomicrmw( ; GFX1250-LABEL: private_singlethread_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -13414,8 +17529,15 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13426,8 +17548,15 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13438,8 +17567,15 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13450,8 +17586,15 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13466,8 +17609,15 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -13478,8 +17628,15 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13490,8 +17647,15 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13500,8 +17664,14 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -13509,8 +17679,14 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -13518,8 +17694,14 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_atomicrmw( ; GFX11-WGP-LABEL: private_singlethread_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -13527,8 +17709,14 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_atomicrmw( ; GFX11-CU-LABEL: private_singlethread_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -13536,8 +17724,14 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_atomicrmw( ; GFX12-WGP-LABEL: private_singlethread_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -13545,8 +17739,14 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_atomicrmw( ; GFX12-CU-LABEL: private_singlethread_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -13554,29 +17754,34 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_atomicrmw( ; GFX1250-LABEL: private_singlethread_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -13593,8 +17798,15 @@ define amdgpu_kernel void @private_singlethread_one_as_release_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13605,8 +17817,15 @@ define amdgpu_kernel void @private_singlethread_one_as_release_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13617,8 +17836,15 @@ define amdgpu_kernel void @private_singlethread_one_as_release_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13629,8 +17855,15 @@ define amdgpu_kernel void @private_singlethread_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13645,8 +17878,15 @@ define amdgpu_kernel void @private_singlethread_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -13657,8 +17897,15 @@ define amdgpu_kernel void @private_singlethread_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13669,8 +17916,15 @@ define amdgpu_kernel void @private_singlethread_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13679,8 +17933,14 @@ define amdgpu_kernel void @private_singlethread_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -13688,8 +17948,14 @@ define amdgpu_kernel void @private_singlethread_one_as_release_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -13697,8 +17963,14 @@ define amdgpu_kernel void @private_singlethread_one_as_release_atomicrmw( ; GFX11-WGP-LABEL: private_singlethread_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -13706,8 +17978,14 @@ define amdgpu_kernel void @private_singlethread_one_as_release_atomicrmw( ; GFX11-CU-LABEL: private_singlethread_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -13715,8 +17993,14 @@ define amdgpu_kernel void @private_singlethread_one_as_release_atomicrmw( ; GFX12-WGP-LABEL: private_singlethread_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -13724,8 +18008,14 @@ define amdgpu_kernel void @private_singlethread_one_as_release_atomicrmw( ; GFX12-CU-LABEL: private_singlethread_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -13733,29 +18023,34 @@ define amdgpu_kernel void @private_singlethread_one_as_release_atomicrmw( ; GFX1250-LABEL: private_singlethread_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -13772,8 +18067,15 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13784,8 +18086,15 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13796,8 +18105,15 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13808,8 +18124,15 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13824,8 +18147,15 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -13836,8 +18166,15 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13848,8 +18185,15 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13858,8 +18202,14 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -13867,8 +18217,14 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -13876,8 +18232,14 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_atomicrmw( ; GFX11-WGP-LABEL: private_singlethread_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -13885,8 +18247,14 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_atomicrmw( ; GFX11-CU-LABEL: private_singlethread_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -13894,8 +18262,14 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: private_singlethread_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -13903,8 +18277,14 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_atomicrmw( ; GFX12-CU-LABEL: private_singlethread_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -13912,29 +18292,34 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_atomicrmw( ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -13951,8 +18336,15 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13963,8 +18355,15 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13975,8 +18374,15 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13987,8 +18393,15 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14003,8 +18416,15 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -14015,8 +18435,15 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14027,8 +18454,15 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14037,8 +18471,14 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -14046,8 +18486,14 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -14055,8 +18501,14 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_atomicrmw( ; GFX11-WGP-LABEL: private_singlethread_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -14064,8 +18516,14 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_atomicrmw( ; GFX11-CU-LABEL: private_singlethread_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -14073,8 +18531,14 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: private_singlethread_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -14082,8 +18546,14 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_atomicrmw( ; GFX12-CU-LABEL: private_singlethread_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -14091,29 +18561,34 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_atomicrmw( ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -14130,6 +18605,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14148,6 +18627,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14165,6 +18648,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -14182,6 +18669,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -14203,6 +18694,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14220,6 +18715,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14237,6 +18736,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14252,6 +18755,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14264,6 +18771,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14276,6 +18787,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: private_singlethread_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14288,6 +18803,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: private_singlethread_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14300,6 +18819,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: private_singlethread_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14312,6 +18835,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: private_singlethread_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14325,22 +18852,26 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -14366,6 +18897,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14384,6 +18919,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14401,6 +18940,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -14418,6 +18961,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -14439,6 +18986,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14456,6 +19007,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14473,6 +19028,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14488,6 +19047,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14500,6 +19063,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14512,6 +19079,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: private_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14524,6 +19095,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: private_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14536,6 +19111,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: private_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14548,6 +19127,10 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: private_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14561,22 +19144,26 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -14602,6 +19189,10 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14620,6 +19211,10 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14637,6 +19232,10 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -14654,6 +19253,10 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -14675,6 +19278,10 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14692,6 +19299,10 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14709,6 +19320,10 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14724,6 +19339,10 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14736,6 +19355,10 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14748,6 +19371,10 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: private_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14760,6 +19387,10 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: private_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14772,6 +19403,10 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: private_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14784,6 +19419,10 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: private_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14797,22 +19436,26 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -14838,6 +19481,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_cmpxc ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -14858,6 +19507,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_cmpxc ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -14878,6 +19533,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_cmpxc ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14895,6 +19556,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_cmpxc ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14916,6 +19583,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_cmpxc ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -14936,6 +19609,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14954,6 +19633,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_cmpxc ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14970,6 +19655,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_cmpxc ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -14987,6 +19678,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_cmpxc ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15004,6 +19701,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_cmpxc ; GFX11-WGP-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -15019,6 +19722,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_cmpxc ; GFX11-CU-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -15034,6 +19743,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_cmpxc ; GFX12-WGP-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15048,6 +19763,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_cmpxc ; GFX12-CU-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15062,26 +19783,32 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_cmpxc ; GFX1250-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -15089,6 +19816,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_cmpxc ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -15108,6 +19836,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_cmpxchg ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -15128,6 +19862,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_cmpxchg ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -15148,6 +19888,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_cmpxchg ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15165,6 +19911,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_cmpxchg ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15186,6 +19938,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_cmpxchg ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -15206,6 +19964,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15224,6 +19988,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_cmpxchg ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15240,6 +20010,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_cmpxchg ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15257,6 +20033,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_cmpxchg ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15274,6 +20056,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_cmpxchg ; GFX11-WGP-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -15289,6 +20077,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_cmpxchg ; GFX11-CU-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -15304,6 +20098,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_cmpxchg ; GFX12-WGP-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15318,6 +20118,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_cmpxchg ; GFX12-CU-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15332,26 +20138,32 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_cmpxchg ; GFX1250-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -15359,6 +20171,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_cmpxchg ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -15378,6 +20191,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_cmpxchg ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -15398,6 +20217,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_cmpxchg ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -15418,6 +20243,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_cmpxchg ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15435,6 +20266,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_cmpxchg ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15456,6 +20293,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_cmpxchg ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -15476,6 +20319,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15494,6 +20343,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_cmpxchg ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15510,6 +20365,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_cmpxchg ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15527,6 +20388,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_cmpxchg ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15544,6 +20411,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_cmpxchg ; GFX11-WGP-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -15559,6 +20432,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_cmpxchg ; GFX11-CU-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -15574,6 +20453,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_cmpxchg ; GFX12-WGP-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15588,6 +20473,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_cmpxchg ; GFX12-CU-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15602,26 +20493,32 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_cmpxchg ; GFX1250-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -15629,6 +20526,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_cmpxchg ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -15648,6 +20546,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_cmpxchg ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -15668,6 +20572,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_cmpxchg ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -15688,6 +20598,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_cmpxchg ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15705,6 +20621,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_cmpxchg ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15726,6 +20648,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_cmpxchg ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -15746,6 +20674,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15764,6 +20698,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_cmpxchg ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15780,6 +20720,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_cmpxchg ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15797,6 +20743,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_cmpxchg ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15814,6 +20766,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_cmpxchg ; GFX11-WGP-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -15829,6 +20787,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_cmpxchg ; GFX11-CU-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -15844,6 +20808,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_cmpxchg ; GFX12-WGP-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15858,6 +20828,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_cmpxchg ; GFX12-CU-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15872,26 +20848,32 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_cmpxchg ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -15899,6 +20881,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_cmpxchg ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -15918,6 +20901,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_cmpxchg ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -15938,6 +20927,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_cmpxchg ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -15958,6 +20953,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_cmpxchg ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15975,6 +20976,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_cmpxchg ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15996,6 +21003,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_cmpxchg ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -16016,6 +21029,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16034,6 +21053,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_cmpxchg ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16050,6 +21075,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_cmpxchg ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16067,6 +21098,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_cmpxchg ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16084,6 +21121,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_cmpxchg ; GFX11-WGP-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -16099,6 +21142,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_cmpxchg ; GFX11-CU-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -16114,6 +21163,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_cmpxchg ; GFX12-WGP-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16128,6 +21183,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_cmpxchg ; GFX12-CU-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16142,26 +21203,32 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_cmpxchg ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -16169,6 +21236,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_cmpxchg ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -16188,6 +21256,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_cmpxchg ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -16208,6 +21282,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_cmpxchg ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -16228,6 +21308,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_cmpxchg ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16245,6 +21331,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_cmpxchg ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16266,6 +21358,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_cmpxchg ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -16286,6 +21384,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16304,6 +21408,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_cmpxchg ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16320,6 +21430,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_cmpxchg ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16337,6 +21453,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_cmpxchg ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16354,6 +21476,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_cmpxchg ; GFX11-WGP-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -16369,6 +21497,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_cmpxchg ; GFX11-CU-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -16384,6 +21518,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_cmpxchg ; GFX12-WGP-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16398,6 +21538,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_cmpxchg ; GFX12-CU-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16412,26 +21558,32 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_cmpxchg ; GFX1250-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -16439,6 +21591,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_cmpxchg ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -16458,6 +21611,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -16478,6 +21637,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -16498,6 +21663,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16515,6 +21686,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16536,6 +21713,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -16556,6 +21739,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16574,6 +21763,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16590,6 +21785,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16607,6 +21808,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16624,6 +21831,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -16639,6 +21852,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -16654,6 +21873,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16668,6 +21893,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16682,26 +21913,32 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX1250-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -16709,6 +21946,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -16728,6 +21966,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -16748,6 +21992,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -16768,6 +22018,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16785,6 +22041,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16806,6 +22068,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -16826,6 +22094,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16844,6 +22118,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16860,6 +22140,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16877,6 +22163,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16894,6 +22186,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -16909,6 +22207,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -16924,6 +22228,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16938,6 +22248,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16952,26 +22268,32 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_cmpxchg( ; GFX1250-LABEL: private_singlethread_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -16979,6 +22301,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -16998,6 +22321,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -17018,6 +22347,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -17038,6 +22373,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17055,6 +22396,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17076,6 +22423,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -17096,6 +22449,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17114,6 +22473,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17130,6 +22495,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17147,6 +22518,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17164,6 +22541,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -17179,6 +22562,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -17194,6 +22583,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17208,6 +22603,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17222,26 +22623,32 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -17249,6 +22656,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -17268,6 +22676,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -17288,6 +22702,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -17308,6 +22728,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17325,6 +22751,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17346,6 +22778,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -17366,6 +22804,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17384,6 +22828,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17400,6 +22850,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17417,6 +22873,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17434,6 +22896,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -17449,6 +22917,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -17464,6 +22938,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17478,6 +22958,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17492,26 +22978,32 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -17519,6 +23011,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -17538,6 +23031,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_cmpxchg ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -17558,6 +23057,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_cmpxchg ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -17578,6 +23083,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_cmpxchg ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17595,6 +23106,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_cmpxchg ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17616,6 +23133,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_cmpxchg ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -17636,6 +23159,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17654,6 +23183,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_cmpxchg ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17670,6 +23205,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_cmpxchg ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17687,6 +23228,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_cmpxchg ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17704,6 +23251,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_cmpxchg ; GFX11-WGP-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -17719,6 +23272,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_cmpxchg ; GFX11-CU-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -17734,6 +23293,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_cmpxchg ; GFX12-WGP-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17748,6 +23313,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_cmpxchg ; GFX12-CU-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17762,26 +23333,32 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_cmpxchg ; GFX1250-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -17789,6 +23366,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_cmpxchg ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -17808,6 +23386,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -17828,6 +23412,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -17848,6 +23438,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17865,6 +23461,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17886,6 +23488,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -17906,6 +23514,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17924,6 +23538,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17940,6 +23560,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17957,6 +23583,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17974,6 +23606,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -17989,6 +23627,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -18004,6 +23648,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18018,6 +23668,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18032,26 +23688,32 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -18059,6 +23721,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -18078,6 +23741,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -18098,6 +23767,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -18118,6 +23793,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18135,6 +23816,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18156,6 +23843,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -18176,6 +23869,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18194,6 +23893,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18210,6 +23915,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18227,6 +23938,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18244,6 +23961,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -18259,6 +23982,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -18274,6 +24003,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18288,6 +24023,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18302,26 +24043,32 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX1250-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -18329,6 +24076,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -18348,6 +24096,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -18368,6 +24122,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -18388,6 +24148,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18405,6 +24171,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18426,6 +24198,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -18446,6 +24224,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18464,6 +24248,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18480,6 +24270,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18497,6 +24293,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18514,6 +24316,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -18529,6 +24337,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -18544,6 +24358,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18558,6 +24378,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18572,26 +24398,32 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -18599,6 +24431,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -18618,6 +24451,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -18638,6 +24477,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -18658,6 +24503,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18675,6 +24526,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18696,6 +24553,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -18716,6 +24579,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18734,6 +24603,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18750,6 +24625,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18767,6 +24648,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18784,6 +24671,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -18799,6 +24692,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -18814,6 +24713,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18828,6 +24733,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18842,26 +24753,32 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -18869,6 +24786,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -18889,6 +24807,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_ret_c ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -18913,6 +24837,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_ret_c ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -18935,6 +24865,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_ret_c ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18954,6 +24890,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_ret_c ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18978,6 +24920,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_ret_c ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -19000,6 +24948,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_ret_c ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19020,6 +24974,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_ret_c ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19039,6 +24999,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_ret_c ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19059,6 +25025,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_ret_c ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19078,6 +25050,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_ret_c ; GFX11-WGP-LABEL: private_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -19094,6 +25072,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_ret_c ; GFX11-CU-LABEL: private_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -19110,6 +25094,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_ret_c ; GFX12-WGP-LABEL: private_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19125,6 +25115,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_ret_c ; GFX12-CU-LABEL: private_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19141,25 +25137,31 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_ret_c ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -19191,6 +25193,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_ret_cmp ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -19215,6 +25223,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_ret_cmp ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -19237,6 +25251,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_ret_cmp ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19256,6 +25276,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_ret_cmp ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19280,6 +25306,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_ret_cmp ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -19302,6 +25334,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_ret_cmp ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19322,6 +25360,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_ret_cmp ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19341,6 +25385,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_ret_cmp ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19361,6 +25411,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_ret_cmp ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19380,6 +25436,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_ret_cmp ; GFX11-WGP-LABEL: private_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -19396,6 +25458,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_ret_cmp ; GFX11-CU-LABEL: private_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -19412,6 +25480,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_ret_cmp ; GFX12-WGP-LABEL: private_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19427,6 +25501,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_ret_cmp ; GFX12-CU-LABEL: private_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19443,25 +25523,31 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_ret_cmp ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -19493,6 +25579,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_ret_cmp ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -19517,6 +25609,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_ret_cmp ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -19539,6 +25637,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_ret_cmp ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19558,6 +25662,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_ret_cmp ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19582,6 +25692,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_ret_cmp ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -19604,6 +25720,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_ret_cmp ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19624,6 +25746,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_ret_cmp ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19643,6 +25771,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_ret_cmp ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19663,6 +25797,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_ret_cmp ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19682,6 +25822,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_ret_cmp ; GFX11-WGP-LABEL: private_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -19698,6 +25844,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_ret_cmp ; GFX11-CU-LABEL: private_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -19714,6 +25866,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_ret_cmp ; GFX12-WGP-LABEL: private_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19729,6 +25887,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_ret_cmp ; GFX12-CU-LABEL: private_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19745,25 +25909,31 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_ret_cmp ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -19795,6 +25965,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_ret_cmp ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -19819,6 +25995,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_ret_cmp ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -19841,6 +26023,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_ret_cmp ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19860,6 +26048,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_ret_cmp ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19884,6 +26078,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_ret_cmp ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -19906,6 +26106,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_ret_cmp ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19926,6 +26132,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_ret_cmp ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19945,6 +26157,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_ret_cmp ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19965,6 +26183,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_ret_cmp ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19984,6 +26208,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_ret_cmp ; GFX11-WGP-LABEL: private_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -20000,6 +26230,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_ret_cmp ; GFX11-CU-LABEL: private_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -20016,6 +26252,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_ret_cmp ; GFX12-WGP-LABEL: private_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20031,6 +26273,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_ret_cmp ; GFX12-CU-LABEL: private_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20047,25 +26295,31 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_ret_cmp ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -20097,6 +26351,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_ret_cmp ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -20121,6 +26381,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_ret_cmp ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -20143,6 +26409,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_ret_cmp ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20162,6 +26434,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_ret_cmp ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20186,6 +26464,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_ret_cmp ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -20208,6 +26492,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_ret_cmp ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20228,6 +26518,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_ret_cmp ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20247,6 +26543,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_ret_cmp ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20267,6 +26569,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_ret_cmp ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20286,6 +26594,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_ret_cmp ; GFX11-WGP-LABEL: private_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -20302,6 +26616,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_ret_cmp ; GFX11-CU-LABEL: private_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -20318,6 +26638,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_ret_cmp ; GFX12-WGP-LABEL: private_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20333,6 +26659,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_ret_cmp ; GFX12-CU-LABEL: private_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20349,25 +26681,31 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_ret_cmp ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -20399,6 +26737,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_ret_cmp ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -20423,6 +26767,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_ret_cmp ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -20445,6 +26795,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_ret_cmp ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20464,6 +26820,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_ret_cmp ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20488,6 +26850,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_ret_cmp ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -20510,6 +26878,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_ret_cmp ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20530,6 +26904,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_ret_cmp ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20549,6 +26929,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_ret_cmp ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20569,6 +26955,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_ret_cmp ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20588,6 +26980,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_ret_cmp ; GFX11-WGP-LABEL: private_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -20604,6 +27002,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_ret_cmp ; GFX11-CU-LABEL: private_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -20620,6 +27024,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_ret_cmp ; GFX12-WGP-LABEL: private_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20635,6 +27045,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_ret_cmp ; GFX12-CU-LABEL: private_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20651,25 +27067,31 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_ret_cmp ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -20701,6 +27123,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_ret_cmpxc ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -20725,6 +27153,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_ret_cmpxc ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -20747,6 +27181,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_ret_cmpxc ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20766,6 +27206,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_ret_cmpxc ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20790,6 +27236,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_ret_cmpxc ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -20812,6 +27264,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20832,6 +27290,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_ret_cmpxc ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20851,6 +27315,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_ret_cmpxc ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20871,6 +27341,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_ret_cmpxc ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20890,6 +27366,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_ret_cmpxc ; GFX11-WGP-LABEL: private_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -20906,6 +27388,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_ret_cmpxc ; GFX11-CU-LABEL: private_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -20922,6 +27410,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_ret_cmpxc ; GFX12-WGP-LABEL: private_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20937,6 +27431,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_ret_cmpxc ; GFX12-CU-LABEL: private_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20953,25 +27453,31 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_ret_cmpxc ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -21003,6 +27509,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_ret_cmpxc ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -21027,6 +27539,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_ret_cmpxc ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -21049,6 +27567,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_ret_cmpxc ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21068,6 +27592,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_ret_cmpxc ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21092,6 +27622,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_ret_cmpxc ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -21114,6 +27650,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21134,6 +27676,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_ret_cmpxc ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21153,6 +27701,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_ret_cmpxc ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21173,6 +27727,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_ret_cmpxc ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21192,6 +27752,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_ret_cmpxc ; GFX11-WGP-LABEL: private_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -21208,6 +27774,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_ret_cmpxc ; GFX11-CU-LABEL: private_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -21224,6 +27796,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_ret_cmpxc ; GFX12-WGP-LABEL: private_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21239,6 +27817,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_ret_cmpxc ; GFX12-CU-LABEL: private_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21255,25 +27839,31 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_ret_cmpxc ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -21305,6 +27895,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_ret_cmpxc ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -21329,6 +27925,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_ret_cmpxc ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -21351,6 +27953,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_ret_cmpxc ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21370,6 +27978,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_ret_cmpxc ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21394,6 +28008,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_ret_cmpxc ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -21416,6 +28036,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21436,6 +28062,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_ret_cmpxc ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21455,6 +28087,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_ret_cmpxc ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21475,6 +28113,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_ret_cmpxc ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21494,6 +28138,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_ret_cmpxc ; GFX11-WGP-LABEL: private_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -21510,6 +28160,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_ret_cmpxc ; GFX11-CU-LABEL: private_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -21526,6 +28182,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_ret_cmpxc ; GFX12-WGP-LABEL: private_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21541,6 +28203,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_ret_cmpxc ; GFX12-CU-LABEL: private_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21557,25 +28225,31 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_ret_cmpxc ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -21607,6 +28281,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_ret_cmpxc ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -21631,6 +28311,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_ret_cmpxc ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -21653,6 +28339,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_ret_cmpxc ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21672,6 +28364,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_ret_cmpxc ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21696,6 +28394,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_ret_cmpxc ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -21718,6 +28422,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21738,6 +28448,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_ret_cmpxc ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21757,6 +28473,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_ret_cmpxc ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21777,6 +28499,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_ret_cmpxc ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21796,6 +28524,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_ret_cmpxc ; GFX11-WGP-LABEL: private_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -21812,6 +28546,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_ret_cmpxc ; GFX11-CU-LABEL: private_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -21828,6 +28568,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_ret_cmpxc ; GFX12-WGP-LABEL: private_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21843,6 +28589,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_ret_cmpxc ; GFX12-CU-LABEL: private_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21859,25 +28611,31 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_ret_cmpxc ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -21909,6 +28667,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_ret_cmp ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -21933,6 +28697,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_ret_cmp ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -21955,6 +28725,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_ret_cmp ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21974,6 +28750,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_ret_cmp ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21998,6 +28780,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_ret_cmp ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -22020,6 +28808,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_ret_cmp ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22040,6 +28834,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_ret_cmp ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22059,6 +28859,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_ret_cmp ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22079,6 +28885,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_ret_cmp ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22098,6 +28910,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_ret_cmp ; GFX11-WGP-LABEL: private_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -22114,6 +28932,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_ret_cmp ; GFX11-CU-LABEL: private_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -22130,6 +28954,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_ret_cmp ; GFX12-WGP-LABEL: private_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22145,6 +28975,12 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_ret_cmp ; GFX12-CU-LABEL: private_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22161,25 +28997,31 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_ret_cmp ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -22211,6 +29053,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_ret_cmpxc ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -22235,6 +29083,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_ret_cmpxc ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -22257,6 +29111,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_ret_cmpxc ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22276,6 +29136,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_ret_cmpxc ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22300,6 +29166,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_ret_cmpxc ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -22322,6 +29194,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22342,6 +29220,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_ret_cmpxc ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22361,6 +29245,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_ret_cmpxc ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22381,6 +29271,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_ret_cmpxc ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22400,6 +29296,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_ret_cmpxc ; GFX11-WGP-LABEL: private_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -22416,6 +29318,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_ret_cmpxc ; GFX11-CU-LABEL: private_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -22432,6 +29340,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_ret_cmpxc ; GFX12-WGP-LABEL: private_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22447,6 +29361,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_ret_cmpxc ; GFX12-CU-LABEL: private_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22463,25 +29383,31 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_ret_cmpxc ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -22513,6 +29439,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_ret_cmpxc ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -22537,6 +29469,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_ret_cmpxc ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -22559,6 +29497,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_ret_cmpxc ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22578,6 +29522,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_ret_cmpxc ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22602,6 +29552,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_ret_cmpxc ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -22624,6 +29580,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22644,6 +29606,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_ret_cmpxc ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22663,6 +29631,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_ret_cmpxc ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22683,6 +29657,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_ret_cmpxc ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22702,6 +29682,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_ret_cmpxc ; GFX11-WGP-LABEL: private_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -22718,6 +29704,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_ret_cmpxc ; GFX11-CU-LABEL: private_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -22734,6 +29726,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_ret_cmpxc ; GFX12-WGP-LABEL: private_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22749,6 +29747,12 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_ret_cmpxc ; GFX12-CU-LABEL: private_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22765,25 +29769,31 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_ret_cmpxc ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -22815,6 +29825,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxc ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -22839,6 +29855,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxc ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -22861,6 +29883,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxc ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22880,6 +29908,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxc ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22904,6 +29938,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxc ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -22926,6 +29966,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22946,6 +29992,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxc ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22965,6 +30017,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxc ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22985,6 +30043,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxc ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -23004,6 +30068,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxc ; GFX11-WGP-LABEL: private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -23020,6 +30090,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxc ; GFX11-CU-LABEL: private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -23036,6 +30112,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxc ; GFX12-WGP-LABEL: private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -23051,6 +30133,12 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxc ; GFX12-CU-LABEL: private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -23067,25 +30155,31 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxc ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -23117,6 +30211,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxc ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -23141,6 +30241,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxc ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -23163,6 +30269,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxc ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -23182,6 +30294,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxc ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -23206,6 +30324,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxc ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -23228,6 +30352,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23248,6 +30378,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxc ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23267,6 +30403,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxc ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -23287,6 +30429,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxc ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -23306,6 +30454,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxc ; GFX11-WGP-LABEL: private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -23322,6 +30476,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxc ; GFX11-CU-LABEL: private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -23338,6 +30498,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxc ; GFX12-WGP-LABEL: private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -23353,6 +30519,12 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxc ; GFX12-CU-LABEL: private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -23369,25 +30541,31 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxc ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll index 5940897e1c80..5a5dbffa6114 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll @@ -19,11 +19,15 @@ define amdgpu_kernel void @private_system_unordered_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -33,11 +37,15 @@ define amdgpu_kernel void @private_system_unordered_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -47,11 +55,15 @@ define amdgpu_kernel void @private_system_unordered_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -61,11 +73,15 @@ define amdgpu_kernel void @private_system_unordered_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -79,11 +95,15 @@ define amdgpu_kernel void @private_system_unordered_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -93,11 +113,15 @@ define amdgpu_kernel void @private_system_unordered_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -107,11 +131,15 @@ define amdgpu_kernel void @private_system_unordered_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -119,84 +147,108 @@ define amdgpu_kernel void @private_system_unordered_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_system_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_system_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_system_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_system_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_system_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_system_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_system_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -205,6 +257,7 @@ define amdgpu_kernel void @private_system_unordered_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -219,11 +272,15 @@ define amdgpu_kernel void @private_system_monotonic_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -233,11 +290,15 @@ define amdgpu_kernel void @private_system_monotonic_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -247,11 +308,15 @@ define amdgpu_kernel void @private_system_monotonic_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -261,11 +326,15 @@ define amdgpu_kernel void @private_system_monotonic_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -279,11 +348,15 @@ define amdgpu_kernel void @private_system_monotonic_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -293,11 +366,15 @@ define amdgpu_kernel void @private_system_monotonic_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -307,11 +384,15 @@ define amdgpu_kernel void @private_system_monotonic_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -319,84 +400,108 @@ define amdgpu_kernel void @private_system_monotonic_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_system_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_system_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_system_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_system_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_system_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_system_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_system_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -405,6 +510,7 @@ define amdgpu_kernel void @private_system_monotonic_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -419,11 +525,15 @@ define amdgpu_kernel void @private_system_acquire_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -433,11 +543,15 @@ define amdgpu_kernel void @private_system_acquire_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -447,11 +561,15 @@ define amdgpu_kernel void @private_system_acquire_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -461,11 +579,15 @@ define amdgpu_kernel void @private_system_acquire_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -479,11 +601,15 @@ define amdgpu_kernel void @private_system_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -493,11 +619,15 @@ define amdgpu_kernel void @private_system_acquire_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -507,11 +637,15 @@ define amdgpu_kernel void @private_system_acquire_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -519,84 +653,108 @@ define amdgpu_kernel void @private_system_acquire_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_system_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_system_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_system_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_system_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_system_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_system_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_system_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -607,6 +765,7 @@ define amdgpu_kernel void @private_system_acquire_load( ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -621,11 +780,15 @@ define amdgpu_kernel void @private_system_seq_cst_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -635,11 +798,15 @@ define amdgpu_kernel void @private_system_seq_cst_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -649,11 +816,15 @@ define amdgpu_kernel void @private_system_seq_cst_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -663,11 +834,15 @@ define amdgpu_kernel void @private_system_seq_cst_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -681,11 +856,15 @@ define amdgpu_kernel void @private_system_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -695,11 +874,15 @@ define amdgpu_kernel void @private_system_seq_cst_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -709,11 +892,15 @@ define amdgpu_kernel void @private_system_seq_cst_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -721,84 +908,108 @@ define amdgpu_kernel void @private_system_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_system_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_system_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_system_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_system_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_system_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_system_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_system_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -811,6 +1022,7 @@ define amdgpu_kernel void @private_system_seq_cst_load( ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -825,10 +1037,14 @@ define amdgpu_kernel void @private_system_unordered_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -837,10 +1053,14 @@ define amdgpu_kernel void @private_system_unordered_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -849,10 +1069,14 @@ define amdgpu_kernel void @private_system_unordered_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -861,10 +1085,14 @@ define amdgpu_kernel void @private_system_unordered_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -877,10 +1105,14 @@ define amdgpu_kernel void @private_system_unordered_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -889,10 +1121,14 @@ define amdgpu_kernel void @private_system_unordered_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -901,65 +1137,93 @@ define amdgpu_kernel void @private_system_unordered_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_system_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_system_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_system_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_system_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_system_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_system_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -967,22 +1231,26 @@ define amdgpu_kernel void @private_system_unordered_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -1004,10 +1272,14 @@ define amdgpu_kernel void @private_system_monotonic_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -1016,10 +1288,14 @@ define amdgpu_kernel void @private_system_monotonic_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -1028,10 +1304,14 @@ define amdgpu_kernel void @private_system_monotonic_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -1040,10 +1320,14 @@ define amdgpu_kernel void @private_system_monotonic_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -1056,10 +1340,14 @@ define amdgpu_kernel void @private_system_monotonic_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1068,10 +1356,14 @@ define amdgpu_kernel void @private_system_monotonic_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1080,65 +1372,93 @@ define amdgpu_kernel void @private_system_monotonic_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_system_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_system_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_system_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_system_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_system_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_system_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -1146,22 +1466,26 @@ define amdgpu_kernel void @private_system_monotonic_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -1183,10 +1507,14 @@ define amdgpu_kernel void @private_system_release_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -1195,10 +1523,14 @@ define amdgpu_kernel void @private_system_release_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -1207,10 +1539,14 @@ define amdgpu_kernel void @private_system_release_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -1219,10 +1555,14 @@ define amdgpu_kernel void @private_system_release_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -1235,10 +1575,14 @@ define amdgpu_kernel void @private_system_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1247,10 +1591,14 @@ define amdgpu_kernel void @private_system_release_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1259,65 +1607,93 @@ define amdgpu_kernel void @private_system_release_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_system_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_system_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_system_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_system_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_system_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_system_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -1325,22 +1701,26 @@ define amdgpu_kernel void @private_system_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -1367,10 +1747,14 @@ define amdgpu_kernel void @private_system_seq_cst_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -1379,10 +1763,14 @@ define amdgpu_kernel void @private_system_seq_cst_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -1391,10 +1779,14 @@ define amdgpu_kernel void @private_system_seq_cst_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -1403,10 +1795,14 @@ define amdgpu_kernel void @private_system_seq_cst_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -1419,10 +1815,14 @@ define amdgpu_kernel void @private_system_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1431,10 +1831,14 @@ define amdgpu_kernel void @private_system_seq_cst_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1443,65 +1847,93 @@ define amdgpu_kernel void @private_system_seq_cst_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_system_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_system_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_system_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_system_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_system_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_system_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -1509,22 +1941,26 @@ define amdgpu_kernel void @private_system_seq_cst_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -1552,8 +1988,15 @@ define amdgpu_kernel void @private_system_monotonic_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1564,8 +2007,15 @@ define amdgpu_kernel void @private_system_monotonic_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1576,8 +2026,15 @@ define amdgpu_kernel void @private_system_monotonic_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1588,8 +2045,15 @@ define amdgpu_kernel void @private_system_monotonic_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1604,8 +2068,15 @@ define amdgpu_kernel void @private_system_monotonic_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -1616,8 +2087,15 @@ define amdgpu_kernel void @private_system_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1628,8 +2106,15 @@ define amdgpu_kernel void @private_system_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1638,8 +2123,14 @@ define amdgpu_kernel void @private_system_monotonic_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_system_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -1647,8 +2138,14 @@ define amdgpu_kernel void @private_system_monotonic_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_system_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -1656,8 +2153,14 @@ define amdgpu_kernel void @private_system_monotonic_atomicrmw( ; GFX11-WGP-LABEL: private_system_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -1665,8 +2168,14 @@ define amdgpu_kernel void @private_system_monotonic_atomicrmw( ; GFX11-CU-LABEL: private_system_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -1674,8 +2183,14 @@ define amdgpu_kernel void @private_system_monotonic_atomicrmw( ; GFX12-WGP-LABEL: private_system_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -1683,8 +2198,14 @@ define amdgpu_kernel void @private_system_monotonic_atomicrmw( ; GFX12-CU-LABEL: private_system_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -1692,29 +2213,34 @@ define amdgpu_kernel void @private_system_monotonic_atomicrmw( ; GFX1250-LABEL: private_system_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS @@ -1731,8 +2257,15 @@ define amdgpu_kernel void @private_system_acquire_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1743,8 +2276,15 @@ define amdgpu_kernel void @private_system_acquire_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1755,8 +2295,15 @@ define amdgpu_kernel void @private_system_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1767,8 +2314,15 @@ define amdgpu_kernel void @private_system_acquire_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1783,8 +2337,15 @@ define amdgpu_kernel void @private_system_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -1795,8 +2356,15 @@ define amdgpu_kernel void @private_system_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1807,8 +2375,15 @@ define amdgpu_kernel void @private_system_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1817,8 +2392,14 @@ define amdgpu_kernel void @private_system_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_system_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -1826,8 +2407,14 @@ define amdgpu_kernel void @private_system_acquire_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_system_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -1835,8 +2422,14 @@ define amdgpu_kernel void @private_system_acquire_atomicrmw( ; GFX11-WGP-LABEL: private_system_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -1844,8 +2437,14 @@ define amdgpu_kernel void @private_system_acquire_atomicrmw( ; GFX11-CU-LABEL: private_system_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -1853,8 +2452,14 @@ define amdgpu_kernel void @private_system_acquire_atomicrmw( ; GFX12-WGP-LABEL: private_system_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -1862,8 +2467,14 @@ define amdgpu_kernel void @private_system_acquire_atomicrmw( ; GFX12-CU-LABEL: private_system_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -1871,29 +2482,34 @@ define amdgpu_kernel void @private_system_acquire_atomicrmw( ; GFX1250-LABEL: private_system_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS @@ -1913,8 +2529,15 @@ define amdgpu_kernel void @private_system_release_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1925,8 +2548,15 @@ define amdgpu_kernel void @private_system_release_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1937,8 +2567,15 @@ define amdgpu_kernel void @private_system_release_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1949,8 +2586,15 @@ define amdgpu_kernel void @private_system_release_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1965,8 +2609,15 @@ define amdgpu_kernel void @private_system_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -1977,8 +2628,15 @@ define amdgpu_kernel void @private_system_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1989,8 +2647,15 @@ define amdgpu_kernel void @private_system_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1999,8 +2664,14 @@ define amdgpu_kernel void @private_system_release_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_system_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2008,8 +2679,14 @@ define amdgpu_kernel void @private_system_release_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_system_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -2017,8 +2694,14 @@ define amdgpu_kernel void @private_system_release_atomicrmw( ; GFX11-WGP-LABEL: private_system_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -2026,8 +2709,14 @@ define amdgpu_kernel void @private_system_release_atomicrmw( ; GFX11-CU-LABEL: private_system_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -2035,8 +2724,14 @@ define amdgpu_kernel void @private_system_release_atomicrmw( ; GFX12-WGP-LABEL: private_system_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -2044,8 +2739,14 @@ define amdgpu_kernel void @private_system_release_atomicrmw( ; GFX12-CU-LABEL: private_system_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -2053,29 +2754,34 @@ define amdgpu_kernel void @private_system_release_atomicrmw( ; GFX1250-LABEL: private_system_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -2097,8 +2803,15 @@ define amdgpu_kernel void @private_system_acq_rel_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2109,8 +2822,15 @@ define amdgpu_kernel void @private_system_acq_rel_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2121,8 +2841,15 @@ define amdgpu_kernel void @private_system_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2133,8 +2860,15 @@ define amdgpu_kernel void @private_system_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2149,8 +2883,15 @@ define amdgpu_kernel void @private_system_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -2161,8 +2902,15 @@ define amdgpu_kernel void @private_system_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2173,8 +2921,15 @@ define amdgpu_kernel void @private_system_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2183,8 +2938,14 @@ define amdgpu_kernel void @private_system_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_system_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2192,8 +2953,14 @@ define amdgpu_kernel void @private_system_acq_rel_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_system_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -2201,8 +2968,14 @@ define amdgpu_kernel void @private_system_acq_rel_atomicrmw( ; GFX11-WGP-LABEL: private_system_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -2210,8 +2983,14 @@ define amdgpu_kernel void @private_system_acq_rel_atomicrmw( ; GFX11-CU-LABEL: private_system_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -2219,8 +2998,14 @@ define amdgpu_kernel void @private_system_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: private_system_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -2228,8 +3013,14 @@ define amdgpu_kernel void @private_system_acq_rel_atomicrmw( ; GFX12-CU-LABEL: private_system_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -2237,29 +3028,34 @@ define amdgpu_kernel void @private_system_acq_rel_atomicrmw( ; GFX1250-LABEL: private_system_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -2284,8 +3080,15 @@ define amdgpu_kernel void @private_system_seq_cst_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2296,8 +3099,15 @@ define amdgpu_kernel void @private_system_seq_cst_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2308,8 +3118,15 @@ define amdgpu_kernel void @private_system_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2320,8 +3137,15 @@ define amdgpu_kernel void @private_system_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2336,8 +3160,15 @@ define amdgpu_kernel void @private_system_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -2348,8 +3179,15 @@ define amdgpu_kernel void @private_system_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2360,8 +3198,15 @@ define amdgpu_kernel void @private_system_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2370,8 +3215,14 @@ define amdgpu_kernel void @private_system_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_system_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2379,8 +3230,14 @@ define amdgpu_kernel void @private_system_seq_cst_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_system_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -2388,8 +3245,14 @@ define amdgpu_kernel void @private_system_seq_cst_atomicrmw( ; GFX11-WGP-LABEL: private_system_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -2397,8 +3260,14 @@ define amdgpu_kernel void @private_system_seq_cst_atomicrmw( ; GFX11-CU-LABEL: private_system_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -2406,8 +3275,14 @@ define amdgpu_kernel void @private_system_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: private_system_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -2415,8 +3290,14 @@ define amdgpu_kernel void @private_system_seq_cst_atomicrmw( ; GFX12-CU-LABEL: private_system_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -2424,29 +3305,34 @@ define amdgpu_kernel void @private_system_seq_cst_atomicrmw( ; GFX1250-LABEL: private_system_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -2471,6 +3357,10 @@ define amdgpu_kernel void @private_system_acquire_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2489,6 +3379,10 @@ define amdgpu_kernel void @private_system_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2506,6 +3400,10 @@ define amdgpu_kernel void @private_system_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2523,6 +3421,10 @@ define amdgpu_kernel void @private_system_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2544,6 +3446,10 @@ define amdgpu_kernel void @private_system_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2561,6 +3467,10 @@ define amdgpu_kernel void @private_system_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2578,6 +3488,10 @@ define amdgpu_kernel void @private_system_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2593,6 +3507,10 @@ define amdgpu_kernel void @private_system_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_system_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -2605,6 +3523,10 @@ define amdgpu_kernel void @private_system_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_system_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -2617,6 +3539,10 @@ define amdgpu_kernel void @private_system_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: private_system_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -2629,6 +3555,10 @@ define amdgpu_kernel void @private_system_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: private_system_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -2641,6 +3571,10 @@ define amdgpu_kernel void @private_system_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: private_system_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -2653,6 +3587,10 @@ define amdgpu_kernel void @private_system_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: private_system_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -2666,22 +3604,26 @@ define amdgpu_kernel void @private_system_acquire_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -2709,6 +3651,10 @@ define amdgpu_kernel void @private_system_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2727,6 +3673,10 @@ define amdgpu_kernel void @private_system_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2744,6 +3694,10 @@ define amdgpu_kernel void @private_system_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2761,6 +3715,10 @@ define amdgpu_kernel void @private_system_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2782,6 +3740,10 @@ define amdgpu_kernel void @private_system_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2799,6 +3761,10 @@ define amdgpu_kernel void @private_system_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2816,6 +3782,10 @@ define amdgpu_kernel void @private_system_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2831,6 +3801,10 @@ define amdgpu_kernel void @private_system_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_system_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -2843,6 +3817,10 @@ define amdgpu_kernel void @private_system_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_system_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -2855,6 +3833,10 @@ define amdgpu_kernel void @private_system_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: private_system_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -2867,6 +3849,10 @@ define amdgpu_kernel void @private_system_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: private_system_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -2879,6 +3865,10 @@ define amdgpu_kernel void @private_system_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: private_system_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -2891,6 +3881,10 @@ define amdgpu_kernel void @private_system_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: private_system_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -2904,22 +3898,26 @@ define amdgpu_kernel void @private_system_acq_rel_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -2952,6 +3950,10 @@ define amdgpu_kernel void @private_system_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2970,6 +3972,10 @@ define amdgpu_kernel void @private_system_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2987,6 +3993,10 @@ define amdgpu_kernel void @private_system_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -3004,6 +4014,10 @@ define amdgpu_kernel void @private_system_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -3025,6 +4039,10 @@ define amdgpu_kernel void @private_system_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -3042,6 +4060,10 @@ define amdgpu_kernel void @private_system_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -3059,6 +4081,10 @@ define amdgpu_kernel void @private_system_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -3074,6 +4100,10 @@ define amdgpu_kernel void @private_system_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_system_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -3086,6 +4116,10 @@ define amdgpu_kernel void @private_system_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_system_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -3098,6 +4132,10 @@ define amdgpu_kernel void @private_system_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: private_system_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -3110,6 +4148,10 @@ define amdgpu_kernel void @private_system_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: private_system_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -3122,6 +4164,10 @@ define amdgpu_kernel void @private_system_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: private_system_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -3134,6 +4180,10 @@ define amdgpu_kernel void @private_system_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: private_system_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -3147,22 +4197,26 @@ define amdgpu_kernel void @private_system_seq_cst_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -3195,6 +4249,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -3215,6 +4275,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -3235,6 +4301,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3252,6 +4324,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3273,6 +4351,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -3293,6 +4377,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3311,6 +4401,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3327,6 +4423,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3344,6 +4446,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3361,6 +4469,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_system_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -3376,6 +4490,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_system_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -3391,6 +4511,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_system_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3405,6 +4531,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_system_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3419,26 +4551,32 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: private_system_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -3446,6 +4584,7 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -3465,6 +4604,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -3485,6 +4630,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -3505,6 +4656,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3522,6 +4679,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3543,6 +4706,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -3563,6 +4732,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3581,6 +4756,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3597,6 +4778,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3614,6 +4801,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3631,6 +4824,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_system_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -3646,6 +4845,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_system_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -3661,6 +4866,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_system_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3675,6 +4886,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_system_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3689,26 +4906,32 @@ define amdgpu_kernel void @private_system_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: private_system_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -3716,6 +4939,7 @@ define amdgpu_kernel void @private_system_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -3738,6 +4962,12 @@ define amdgpu_kernel void @private_system_release_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -3758,6 +4988,12 @@ define amdgpu_kernel void @private_system_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -3778,6 +5014,12 @@ define amdgpu_kernel void @private_system_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3795,6 +5037,12 @@ define amdgpu_kernel void @private_system_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3816,6 +5064,12 @@ define amdgpu_kernel void @private_system_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -3836,6 +5090,12 @@ define amdgpu_kernel void @private_system_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3854,6 +5114,12 @@ define amdgpu_kernel void @private_system_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3870,6 +5136,12 @@ define amdgpu_kernel void @private_system_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3887,6 +5159,12 @@ define amdgpu_kernel void @private_system_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3904,6 +5182,12 @@ define amdgpu_kernel void @private_system_release_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_system_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -3919,6 +5203,12 @@ define amdgpu_kernel void @private_system_release_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_system_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -3934,6 +5224,12 @@ define amdgpu_kernel void @private_system_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_system_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3948,6 +5244,12 @@ define amdgpu_kernel void @private_system_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_system_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3962,26 +5264,32 @@ define amdgpu_kernel void @private_system_release_monotonic_cmpxchg( ; GFX1250-LABEL: private_system_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -3989,6 +5297,7 @@ define amdgpu_kernel void @private_system_release_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -4013,6 +5322,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -4033,6 +5348,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -4053,6 +5374,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4070,6 +5397,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4091,6 +5424,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -4111,6 +5450,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4129,6 +5474,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4145,6 +5496,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4162,6 +5519,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4179,6 +5542,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_system_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -4194,6 +5563,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_system_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -4209,6 +5584,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_system_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4223,6 +5604,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_system_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4237,26 +5624,32 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: private_system_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -4264,6 +5657,7 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -4291,6 +5685,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -4311,6 +5711,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -4331,6 +5737,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4348,6 +5760,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4369,6 +5787,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -4389,6 +5813,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4407,6 +5837,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4423,6 +5859,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4440,6 +5882,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4457,6 +5905,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_system_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -4472,6 +5926,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_system_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -4487,6 +5947,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_system_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4501,6 +5967,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_system_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4515,26 +5987,32 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: private_system_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -4542,6 +6020,7 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -4569,6 +6048,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -4589,6 +6074,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -4609,6 +6100,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4626,6 +6123,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4647,6 +6150,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -4667,6 +6176,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4685,6 +6200,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4701,6 +6222,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4718,6 +6245,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4735,6 +6268,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_system_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -4750,6 +6289,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_cmpxchg( ; GFX11-CU-LABEL: private_system_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -4765,6 +6310,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_system_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4779,6 +6330,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: private_system_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4793,26 +6350,32 @@ define amdgpu_kernel void @private_system_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: private_system_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -4820,6 +6383,7 @@ define amdgpu_kernel void @private_system_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -4842,6 +6406,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -4862,6 +6432,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -4882,6 +6458,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4899,6 +6481,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4920,6 +6508,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -4940,6 +6534,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4958,6 +6558,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4974,6 +6580,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4991,6 +6603,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5008,6 +6626,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_system_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -5023,6 +6647,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_cmpxchg( ; GFX11-CU-LABEL: private_system_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -5038,6 +6668,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_system_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5052,6 +6688,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: private_system_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5066,26 +6708,32 @@ define amdgpu_kernel void @private_system_acquire_acquire_cmpxchg( ; GFX1250-LABEL: private_system_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -5093,6 +6741,7 @@ define amdgpu_kernel void @private_system_acquire_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -5115,6 +6764,12 @@ define amdgpu_kernel void @private_system_release_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -5135,6 +6790,12 @@ define amdgpu_kernel void @private_system_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -5155,6 +6816,12 @@ define amdgpu_kernel void @private_system_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5172,6 +6839,12 @@ define amdgpu_kernel void @private_system_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5193,6 +6866,12 @@ define amdgpu_kernel void @private_system_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -5213,6 +6892,12 @@ define amdgpu_kernel void @private_system_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5231,6 +6916,12 @@ define amdgpu_kernel void @private_system_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5247,6 +6938,12 @@ define amdgpu_kernel void @private_system_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5264,6 +6961,12 @@ define amdgpu_kernel void @private_system_release_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5281,6 +6984,12 @@ define amdgpu_kernel void @private_system_release_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_system_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -5296,6 +7005,12 @@ define amdgpu_kernel void @private_system_release_acquire_cmpxchg( ; GFX11-CU-LABEL: private_system_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -5311,6 +7026,12 @@ define amdgpu_kernel void @private_system_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_system_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5325,6 +7046,12 @@ define amdgpu_kernel void @private_system_release_acquire_cmpxchg( ; GFX12-CU-LABEL: private_system_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5339,26 +7066,32 @@ define amdgpu_kernel void @private_system_release_acquire_cmpxchg( ; GFX1250-LABEL: private_system_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -5366,6 +7099,7 @@ define amdgpu_kernel void @private_system_release_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -5393,6 +7127,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -5413,6 +7153,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -5433,6 +7179,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5450,6 +7202,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5471,6 +7229,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -5491,6 +7255,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5509,6 +7279,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5525,6 +7301,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5542,6 +7324,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5559,6 +7347,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_system_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -5574,6 +7368,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_cmpxchg( ; GFX11-CU-LABEL: private_system_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -5589,6 +7389,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_system_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5603,6 +7409,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: private_system_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5617,26 +7429,32 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: private_system_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -5644,6 +7462,7 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -5671,6 +7490,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -5691,6 +7516,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -5711,6 +7542,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5728,6 +7565,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5749,6 +7592,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -5769,6 +7618,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5787,6 +7642,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5803,6 +7664,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5820,6 +7687,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5837,6 +7710,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_system_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -5852,6 +7731,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_cmpxchg( ; GFX11-CU-LABEL: private_system_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -5867,6 +7752,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_system_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5881,6 +7772,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: private_system_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5895,26 +7792,32 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: private_system_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -5922,6 +7825,7 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -5949,6 +7853,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -5969,6 +7879,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -5989,6 +7905,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6006,6 +7928,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6027,6 +7955,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -6047,6 +7981,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6065,6 +8005,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6081,6 +8027,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6098,6 +8050,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6115,6 +8073,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_system_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -6130,6 +8094,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_system_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -6145,6 +8115,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_system_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6159,6 +8135,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_system_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6173,26 +8155,32 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: private_system_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -6200,6 +8188,7 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -6228,6 +8217,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -6252,6 +8247,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -6274,6 +8275,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6293,6 +8300,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6317,6 +8330,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -6339,6 +8358,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6359,6 +8384,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6378,6 +8409,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -6398,6 +8435,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -6417,6 +8460,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -6433,6 +8482,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -6449,6 +8504,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6464,6 +8525,12 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6480,25 +8547,31 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -6530,6 +8603,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -6554,6 +8633,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -6576,6 +8661,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6595,6 +8686,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6619,6 +8716,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -6641,6 +8744,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6661,6 +8770,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6680,6 +8795,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -6700,6 +8821,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -6719,6 +8846,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -6735,6 +8868,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -6751,6 +8890,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6766,6 +8911,12 @@ define amdgpu_kernel void @private_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6782,25 +8933,31 @@ define amdgpu_kernel void @private_system_acquire_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -6834,6 +8991,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -6858,6 +9021,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -6880,6 +9049,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6899,6 +9074,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6923,6 +9104,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -6945,6 +9132,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6965,6 +9158,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6984,6 +9183,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7004,6 +9209,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7023,6 +9234,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -7039,6 +9256,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -7055,6 +9278,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7070,6 +9299,12 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7086,25 +9321,31 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -7143,6 +9384,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -7167,6 +9414,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -7189,6 +9442,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7208,6 +9467,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7232,6 +9497,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -7254,6 +9525,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7274,6 +9551,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7293,6 +9576,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7313,6 +9602,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7332,6 +9627,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -7348,6 +9649,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -7364,6 +9671,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7379,6 +9692,12 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7395,25 +9714,31 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -7452,6 +9777,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -7476,6 +9807,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -7498,6 +9835,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7517,6 +9860,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7541,6 +9890,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -7563,6 +9918,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7583,6 +9944,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7602,6 +9969,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7622,6 +9995,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7641,6 +10020,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -7657,6 +10042,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -7673,6 +10064,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7688,6 +10085,12 @@ define amdgpu_kernel void @private_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7704,25 +10107,31 @@ define amdgpu_kernel void @private_system_monotonic_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -7756,6 +10165,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -7780,6 +10195,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -7802,6 +10223,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7821,6 +10248,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7845,6 +10278,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -7867,6 +10306,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7887,6 +10332,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7906,6 +10357,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7926,6 +10383,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7945,6 +10408,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -7961,6 +10430,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -7977,6 +10452,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7992,6 +10473,12 @@ define amdgpu_kernel void @private_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8008,25 +10495,31 @@ define amdgpu_kernel void @private_system_acquire_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -8060,6 +10553,12 @@ define amdgpu_kernel void @private_system_release_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -8084,6 +10583,12 @@ define amdgpu_kernel void @private_system_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -8106,6 +10611,12 @@ define amdgpu_kernel void @private_system_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8125,6 +10636,12 @@ define amdgpu_kernel void @private_system_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8149,6 +10666,12 @@ define amdgpu_kernel void @private_system_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -8171,6 +10694,12 @@ define amdgpu_kernel void @private_system_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8191,6 +10720,12 @@ define amdgpu_kernel void @private_system_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8210,6 +10745,12 @@ define amdgpu_kernel void @private_system_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8230,6 +10771,12 @@ define amdgpu_kernel void @private_system_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8249,6 +10796,12 @@ define amdgpu_kernel void @private_system_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -8265,6 +10818,12 @@ define amdgpu_kernel void @private_system_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -8281,6 +10840,12 @@ define amdgpu_kernel void @private_system_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8296,6 +10861,12 @@ define amdgpu_kernel void @private_system_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8312,25 +10883,31 @@ define amdgpu_kernel void @private_system_release_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -8369,6 +10946,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -8393,6 +10976,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -8415,6 +11004,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8434,6 +11029,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8458,6 +11059,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -8480,6 +11087,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8500,6 +11113,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8519,6 +11138,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8539,6 +11164,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8558,6 +11189,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -8574,6 +11211,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -8590,6 +11233,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8605,6 +11254,12 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8621,25 +11276,31 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -8678,6 +11339,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -8702,6 +11369,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -8724,6 +11397,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8743,6 +11422,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8767,6 +11452,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -8789,6 +11480,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8809,6 +11506,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8828,6 +11531,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8848,6 +11557,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8867,6 +11582,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -8883,6 +11604,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -8899,6 +11626,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8914,6 +11647,12 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8930,25 +11669,31 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -8987,6 +11732,12 @@ define amdgpu_kernel void @private_system_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -9011,6 +11762,12 @@ define amdgpu_kernel void @private_system_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -9033,6 +11790,12 @@ define amdgpu_kernel void @private_system_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9052,6 +11815,12 @@ define amdgpu_kernel void @private_system_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9076,6 +11845,12 @@ define amdgpu_kernel void @private_system_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -9098,6 +11873,12 @@ define amdgpu_kernel void @private_system_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9118,6 +11899,12 @@ define amdgpu_kernel void @private_system_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9137,6 +11924,12 @@ define amdgpu_kernel void @private_system_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9157,6 +11950,12 @@ define amdgpu_kernel void @private_system_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9176,6 +11975,12 @@ define amdgpu_kernel void @private_system_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -9192,6 +11997,12 @@ define amdgpu_kernel void @private_system_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -9208,6 +12019,12 @@ define amdgpu_kernel void @private_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9223,6 +12040,12 @@ define amdgpu_kernel void @private_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9239,25 +12062,31 @@ define amdgpu_kernel void @private_system_monotonic_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -9296,6 +12125,12 @@ define amdgpu_kernel void @private_system_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -9320,6 +12155,12 @@ define amdgpu_kernel void @private_system_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -9342,6 +12183,12 @@ define amdgpu_kernel void @private_system_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9361,6 +12208,12 @@ define amdgpu_kernel void @private_system_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9385,6 +12238,12 @@ define amdgpu_kernel void @private_system_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -9407,6 +12266,12 @@ define amdgpu_kernel void @private_system_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9427,6 +12292,12 @@ define amdgpu_kernel void @private_system_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9446,6 +12317,12 @@ define amdgpu_kernel void @private_system_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9466,6 +12343,12 @@ define amdgpu_kernel void @private_system_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9485,6 +12368,12 @@ define amdgpu_kernel void @private_system_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -9501,6 +12390,12 @@ define amdgpu_kernel void @private_system_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -9517,6 +12412,12 @@ define amdgpu_kernel void @private_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9532,6 +12433,12 @@ define amdgpu_kernel void @private_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9548,25 +12455,31 @@ define amdgpu_kernel void @private_system_acquire_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -9605,6 +12518,12 @@ define amdgpu_kernel void @private_system_relese_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -9629,6 +12548,12 @@ define amdgpu_kernel void @private_system_relese_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -9651,6 +12576,12 @@ define amdgpu_kernel void @private_system_relese_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9670,6 +12601,12 @@ define amdgpu_kernel void @private_system_relese_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9694,6 +12631,12 @@ define amdgpu_kernel void @private_system_relese_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -9716,6 +12659,12 @@ define amdgpu_kernel void @private_system_relese_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9736,6 +12685,12 @@ define amdgpu_kernel void @private_system_relese_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9755,6 +12710,12 @@ define amdgpu_kernel void @private_system_relese_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9775,6 +12736,12 @@ define amdgpu_kernel void @private_system_relese_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9794,6 +12761,12 @@ define amdgpu_kernel void @private_system_relese_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_relese_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -9810,6 +12783,12 @@ define amdgpu_kernel void @private_system_relese_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_relese_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -9826,6 +12805,12 @@ define amdgpu_kernel void @private_system_relese_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_relese_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9841,6 +12826,12 @@ define amdgpu_kernel void @private_system_relese_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_relese_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9857,25 +12848,31 @@ define amdgpu_kernel void @private_system_relese_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -9914,6 +12911,12 @@ define amdgpu_kernel void @private_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -9938,6 +12941,12 @@ define amdgpu_kernel void @private_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -9960,6 +12969,12 @@ define amdgpu_kernel void @private_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9979,6 +12994,12 @@ define amdgpu_kernel void @private_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10003,6 +13024,12 @@ define amdgpu_kernel void @private_system_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -10025,6 +13052,12 @@ define amdgpu_kernel void @private_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10045,6 +13078,12 @@ define amdgpu_kernel void @private_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10064,6 +13103,12 @@ define amdgpu_kernel void @private_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10084,6 +13129,12 @@ define amdgpu_kernel void @private_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10103,6 +13154,12 @@ define amdgpu_kernel void @private_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -10119,6 +13176,12 @@ define amdgpu_kernel void @private_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -10135,6 +13198,12 @@ define amdgpu_kernel void @private_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10150,6 +13219,12 @@ define amdgpu_kernel void @private_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10166,25 +13241,31 @@ define amdgpu_kernel void @private_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -10223,6 +13304,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -10247,6 +13334,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -10269,6 +13362,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10288,6 +13387,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10312,6 +13417,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -10334,6 +13445,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10354,6 +13471,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10373,6 +13496,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10393,6 +13522,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10412,6 +13547,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -10428,6 +13569,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -10444,6 +13591,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10459,6 +13612,12 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10475,25 +13634,31 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -10530,11 +13695,15 @@ define amdgpu_kernel void @private_system_one_as_unordered_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -10544,11 +13713,15 @@ define amdgpu_kernel void @private_system_one_as_unordered_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -10558,11 +13731,15 @@ define amdgpu_kernel void @private_system_one_as_unordered_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -10572,11 +13749,15 @@ define amdgpu_kernel void @private_system_one_as_unordered_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -10590,11 +13771,15 @@ define amdgpu_kernel void @private_system_one_as_unordered_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -10604,11 +13789,15 @@ define amdgpu_kernel void @private_system_one_as_unordered_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -10618,11 +13807,15 @@ define amdgpu_kernel void @private_system_one_as_unordered_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -10630,84 +13823,108 @@ define amdgpu_kernel void @private_system_one_as_unordered_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_system_one_as_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_system_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_system_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_system_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_system_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_system_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -10716,6 +13933,7 @@ define amdgpu_kernel void @private_system_one_as_unordered_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -10730,11 +13948,15 @@ define amdgpu_kernel void @private_system_one_as_monotonic_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -10744,11 +13966,15 @@ define amdgpu_kernel void @private_system_one_as_monotonic_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -10758,11 +13984,15 @@ define amdgpu_kernel void @private_system_one_as_monotonic_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -10772,11 +14002,15 @@ define amdgpu_kernel void @private_system_one_as_monotonic_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -10790,11 +14024,15 @@ define amdgpu_kernel void @private_system_one_as_monotonic_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -10804,11 +14042,15 @@ define amdgpu_kernel void @private_system_one_as_monotonic_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -10818,11 +14060,15 @@ define amdgpu_kernel void @private_system_one_as_monotonic_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -10830,84 +14076,108 @@ define amdgpu_kernel void @private_system_one_as_monotonic_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_system_one_as_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_system_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_system_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_system_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_system_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_system_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -10916,6 +14186,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -10930,11 +14201,15 @@ define amdgpu_kernel void @private_system_one_as_acquire_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -10944,11 +14219,15 @@ define amdgpu_kernel void @private_system_one_as_acquire_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -10958,11 +14237,15 @@ define amdgpu_kernel void @private_system_one_as_acquire_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -10972,11 +14255,15 @@ define amdgpu_kernel void @private_system_one_as_acquire_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -10990,11 +14277,15 @@ define amdgpu_kernel void @private_system_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -11004,11 +14295,15 @@ define amdgpu_kernel void @private_system_one_as_acquire_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11018,11 +14313,15 @@ define amdgpu_kernel void @private_system_one_as_acquire_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11030,84 +14329,108 @@ define amdgpu_kernel void @private_system_one_as_acquire_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_system_one_as_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_system_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_system_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_system_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_system_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_system_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -11118,6 +14441,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_load( ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -11132,11 +14456,15 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11146,11 +14474,15 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11160,11 +14492,15 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11174,11 +14510,15 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11192,11 +14532,15 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -11206,11 +14550,15 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11220,11 +14568,15 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11232,84 +14584,108 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_system_one_as_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_system_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_system_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_system_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_system_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_system_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -11322,6 +14698,7 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_load( ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -11336,10 +14713,14 @@ define amdgpu_kernel void @private_system_one_as_unordered_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -11348,10 +14729,14 @@ define amdgpu_kernel void @private_system_one_as_unordered_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -11360,10 +14745,14 @@ define amdgpu_kernel void @private_system_one_as_unordered_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -11372,10 +14761,14 @@ define amdgpu_kernel void @private_system_one_as_unordered_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -11388,10 +14781,14 @@ define amdgpu_kernel void @private_system_one_as_unordered_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -11400,10 +14797,14 @@ define amdgpu_kernel void @private_system_one_as_unordered_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -11412,65 +14813,93 @@ define amdgpu_kernel void @private_system_one_as_unordered_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_system_one_as_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_system_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_system_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_system_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_system_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -11478,22 +14907,26 @@ define amdgpu_kernel void @private_system_one_as_unordered_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -11515,10 +14948,14 @@ define amdgpu_kernel void @private_system_one_as_monotonic_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -11527,10 +14964,14 @@ define amdgpu_kernel void @private_system_one_as_monotonic_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -11539,10 +14980,14 @@ define amdgpu_kernel void @private_system_one_as_monotonic_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -11551,10 +14996,14 @@ define amdgpu_kernel void @private_system_one_as_monotonic_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -11567,10 +15016,14 @@ define amdgpu_kernel void @private_system_one_as_monotonic_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -11579,10 +15032,14 @@ define amdgpu_kernel void @private_system_one_as_monotonic_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -11591,65 +15048,93 @@ define amdgpu_kernel void @private_system_one_as_monotonic_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_system_one_as_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_system_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_system_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_system_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_system_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -11657,22 +15142,26 @@ define amdgpu_kernel void @private_system_one_as_monotonic_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -11694,10 +15183,14 @@ define amdgpu_kernel void @private_system_one_as_release_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -11706,10 +15199,14 @@ define amdgpu_kernel void @private_system_one_as_release_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -11718,10 +15215,14 @@ define amdgpu_kernel void @private_system_one_as_release_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -11730,10 +15231,14 @@ define amdgpu_kernel void @private_system_one_as_release_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -11746,10 +15251,14 @@ define amdgpu_kernel void @private_system_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -11758,10 +15267,14 @@ define amdgpu_kernel void @private_system_one_as_release_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -11770,65 +15283,93 @@ define amdgpu_kernel void @private_system_one_as_release_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_system_one_as_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_system_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_system_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_system_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_system_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -11836,22 +15377,26 @@ define amdgpu_kernel void @private_system_one_as_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -11878,10 +15423,14 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -11890,10 +15439,14 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -11902,10 +15455,14 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -11914,10 +15471,14 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -11930,10 +15491,14 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -11942,10 +15507,14 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -11954,65 +15523,93 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_system_one_as_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_system_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_system_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_system_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_system_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -12020,22 +15617,26 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12063,8 +15664,15 @@ define amdgpu_kernel void @private_system_one_as_monotonic_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12075,8 +15683,15 @@ define amdgpu_kernel void @private_system_one_as_monotonic_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12087,8 +15702,15 @@ define amdgpu_kernel void @private_system_one_as_monotonic_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12099,8 +15721,15 @@ define amdgpu_kernel void @private_system_one_as_monotonic_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12115,8 +15744,15 @@ define amdgpu_kernel void @private_system_one_as_monotonic_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -12127,8 +15763,15 @@ define amdgpu_kernel void @private_system_one_as_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12139,8 +15782,15 @@ define amdgpu_kernel void @private_system_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12149,8 +15799,14 @@ define amdgpu_kernel void @private_system_one_as_monotonic_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -12158,8 +15814,14 @@ define amdgpu_kernel void @private_system_one_as_monotonic_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_system_one_as_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -12167,8 +15829,14 @@ define amdgpu_kernel void @private_system_one_as_monotonic_atomicrmw( ; GFX11-WGP-LABEL: private_system_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -12176,8 +15844,14 @@ define amdgpu_kernel void @private_system_one_as_monotonic_atomicrmw( ; GFX11-CU-LABEL: private_system_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -12185,8 +15859,14 @@ define amdgpu_kernel void @private_system_one_as_monotonic_atomicrmw( ; GFX12-WGP-LABEL: private_system_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -12194,8 +15874,14 @@ define amdgpu_kernel void @private_system_one_as_monotonic_atomicrmw( ; GFX12-CU-LABEL: private_system_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -12203,29 +15889,34 @@ define amdgpu_kernel void @private_system_one_as_monotonic_atomicrmw( ; GFX1250-LABEL: private_system_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS @@ -12242,8 +15933,15 @@ define amdgpu_kernel void @private_system_one_as_acquire_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12254,8 +15952,15 @@ define amdgpu_kernel void @private_system_one_as_acquire_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12266,8 +15971,15 @@ define amdgpu_kernel void @private_system_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12278,8 +15990,15 @@ define amdgpu_kernel void @private_system_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12294,8 +16013,15 @@ define amdgpu_kernel void @private_system_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -12306,8 +16032,15 @@ define amdgpu_kernel void @private_system_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12318,8 +16051,15 @@ define amdgpu_kernel void @private_system_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12328,8 +16068,14 @@ define amdgpu_kernel void @private_system_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -12337,8 +16083,14 @@ define amdgpu_kernel void @private_system_one_as_acquire_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_system_one_as_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -12346,8 +16098,14 @@ define amdgpu_kernel void @private_system_one_as_acquire_atomicrmw( ; GFX11-WGP-LABEL: private_system_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -12355,8 +16113,14 @@ define amdgpu_kernel void @private_system_one_as_acquire_atomicrmw( ; GFX11-CU-LABEL: private_system_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -12364,8 +16128,14 @@ define amdgpu_kernel void @private_system_one_as_acquire_atomicrmw( ; GFX12-WGP-LABEL: private_system_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -12373,8 +16143,14 @@ define amdgpu_kernel void @private_system_one_as_acquire_atomicrmw( ; GFX12-CU-LABEL: private_system_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -12382,29 +16158,34 @@ define amdgpu_kernel void @private_system_one_as_acquire_atomicrmw( ; GFX1250-LABEL: private_system_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS @@ -12424,8 +16205,15 @@ define amdgpu_kernel void @private_system_one_as_release_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12436,8 +16224,15 @@ define amdgpu_kernel void @private_system_one_as_release_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12448,8 +16243,15 @@ define amdgpu_kernel void @private_system_one_as_release_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12460,8 +16262,15 @@ define amdgpu_kernel void @private_system_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12476,8 +16285,15 @@ define amdgpu_kernel void @private_system_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -12488,8 +16304,15 @@ define amdgpu_kernel void @private_system_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12500,8 +16323,15 @@ define amdgpu_kernel void @private_system_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12510,8 +16340,14 @@ define amdgpu_kernel void @private_system_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -12519,8 +16355,14 @@ define amdgpu_kernel void @private_system_one_as_release_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_system_one_as_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -12528,8 +16370,14 @@ define amdgpu_kernel void @private_system_one_as_release_atomicrmw( ; GFX11-WGP-LABEL: private_system_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -12537,8 +16385,14 @@ define amdgpu_kernel void @private_system_one_as_release_atomicrmw( ; GFX11-CU-LABEL: private_system_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -12546,8 +16400,14 @@ define amdgpu_kernel void @private_system_one_as_release_atomicrmw( ; GFX12-WGP-LABEL: private_system_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -12555,8 +16415,14 @@ define amdgpu_kernel void @private_system_one_as_release_atomicrmw( ; GFX12-CU-LABEL: private_system_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -12564,29 +16430,34 @@ define amdgpu_kernel void @private_system_one_as_release_atomicrmw( ; GFX1250-LABEL: private_system_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -12608,8 +16479,15 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12620,8 +16498,15 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12632,8 +16517,15 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12644,8 +16536,15 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12660,8 +16559,15 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -12672,8 +16578,15 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12684,8 +16597,15 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12694,8 +16614,14 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -12703,8 +16629,14 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_system_one_as_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -12712,8 +16644,14 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_atomicrmw( ; GFX11-WGP-LABEL: private_system_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -12721,8 +16659,14 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_atomicrmw( ; GFX11-CU-LABEL: private_system_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -12730,8 +16674,14 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: private_system_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -12739,8 +16689,14 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_atomicrmw( ; GFX12-CU-LABEL: private_system_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -12748,29 +16704,34 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_atomicrmw( ; GFX1250-LABEL: private_system_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -12795,8 +16756,15 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12807,8 +16775,15 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12819,8 +16794,15 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12831,8 +16813,15 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12847,8 +16836,15 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -12859,8 +16855,15 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12871,8 +16874,15 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12881,8 +16891,14 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -12890,8 +16906,14 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_system_one_as_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -12899,8 +16921,14 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_atomicrmw( ; GFX11-WGP-LABEL: private_system_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -12908,8 +16936,14 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_atomicrmw( ; GFX11-CU-LABEL: private_system_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -12917,8 +16951,14 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: private_system_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -12926,8 +16966,14 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_atomicrmw( ; GFX12-CU-LABEL: private_system_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -12935,29 +16981,34 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_atomicrmw( ; GFX1250-LABEL: private_system_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -12982,6 +17033,10 @@ define amdgpu_kernel void @private_system_one_as_acquire_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -13000,6 +17055,10 @@ define amdgpu_kernel void @private_system_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13017,6 +17076,10 @@ define amdgpu_kernel void @private_system_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -13034,6 +17097,10 @@ define amdgpu_kernel void @private_system_one_as_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -13055,6 +17122,10 @@ define amdgpu_kernel void @private_system_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -13072,6 +17143,10 @@ define amdgpu_kernel void @private_system_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -13089,6 +17164,10 @@ define amdgpu_kernel void @private_system_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -13104,6 +17183,10 @@ define amdgpu_kernel void @private_system_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -13116,6 +17199,10 @@ define amdgpu_kernel void @private_system_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_system_one_as_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -13128,6 +17215,10 @@ define amdgpu_kernel void @private_system_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: private_system_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -13140,6 +17231,10 @@ define amdgpu_kernel void @private_system_one_as_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: private_system_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -13152,6 +17247,10 @@ define amdgpu_kernel void @private_system_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: private_system_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -13164,6 +17263,10 @@ define amdgpu_kernel void @private_system_one_as_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: private_system_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -13177,22 +17280,26 @@ define amdgpu_kernel void @private_system_one_as_acquire_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -13220,6 +17327,10 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -13238,6 +17349,10 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13255,6 +17370,10 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -13272,6 +17391,10 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -13293,6 +17416,10 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -13310,6 +17437,10 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -13327,6 +17458,10 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -13342,6 +17477,10 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -13354,6 +17493,10 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_system_one_as_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -13366,6 +17509,10 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: private_system_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -13378,6 +17525,10 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: private_system_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -13390,6 +17541,10 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: private_system_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -13402,6 +17557,10 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: private_system_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -13415,22 +17574,26 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -13463,6 +17626,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -13481,6 +17648,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13498,6 +17669,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -13515,6 +17690,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -13536,6 +17715,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -13553,6 +17736,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -13570,6 +17757,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -13585,6 +17776,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -13597,6 +17792,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_system_one_as_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -13609,6 +17808,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: private_system_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -13621,6 +17824,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: private_system_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -13633,6 +17840,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: private_system_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -13645,6 +17856,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: private_system_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -13658,22 +17873,26 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -13706,6 +17925,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -13726,6 +17951,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -13746,6 +17977,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13763,6 +18000,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13784,6 +18027,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -13804,6 +18053,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13822,6 +18077,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13838,6 +18099,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -13855,6 +18122,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -13872,6 +18145,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -13887,6 +18166,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -13902,6 +18187,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13916,6 +18207,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13930,26 +18227,32 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -13957,6 +18260,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -13976,6 +18280,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -13996,6 +18306,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -14016,6 +18332,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14033,6 +18355,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14054,6 +18382,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -14074,6 +18408,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14092,6 +18432,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14108,6 +18454,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -14125,6 +18477,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_one_as_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -14142,6 +18500,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -14157,6 +18521,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -14172,6 +18542,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14186,6 +18562,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14200,26 +18582,32 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: private_system_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -14227,6 +18615,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -14249,6 +18638,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -14269,6 +18664,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -14289,6 +18690,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14306,6 +18713,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14327,6 +18740,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -14347,6 +18766,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14365,6 +18790,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14381,6 +18812,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -14398,6 +18835,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_one_as_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -14415,6 +18858,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -14430,6 +18879,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -14445,6 +18900,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14459,6 +18920,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14473,26 +18940,32 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_cmpxchg( ; GFX1250-LABEL: private_system_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -14500,6 +18973,7 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -14524,6 +18998,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -14544,6 +19024,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -14564,6 +19050,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14581,6 +19073,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14602,6 +19100,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -14622,6 +19126,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14640,6 +19150,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14656,6 +19172,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -14673,6 +19195,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -14690,6 +19218,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -14705,6 +19239,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -14720,6 +19260,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14734,6 +19280,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14748,26 +19300,32 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -14775,6 +19333,7 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -14802,6 +19361,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -14822,6 +19387,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -14842,6 +19413,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14859,6 +19436,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14880,6 +19463,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -14900,6 +19489,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14918,6 +19513,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14934,6 +19535,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -14951,6 +19558,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -14968,6 +19581,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -14983,6 +19602,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -14998,6 +19623,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15012,6 +19643,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15026,26 +19663,32 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -15053,6 +19696,7 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -15080,6 +19724,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -15100,6 +19750,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -15120,6 +19776,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15137,6 +19799,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15158,6 +19826,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -15178,6 +19852,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15196,6 +19876,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15212,6 +19898,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15229,6 +19921,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_one_as_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15246,6 +19944,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -15261,6 +19965,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -15276,6 +19986,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15290,6 +20006,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15304,26 +20026,32 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: private_system_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -15331,6 +20059,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -15353,6 +20082,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -15373,6 +20108,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -15393,6 +20134,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15410,6 +20157,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15431,6 +20184,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -15451,6 +20210,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15469,6 +20234,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15485,6 +20256,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15502,6 +20279,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_one_as_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15519,6 +20302,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -15534,6 +20323,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -15549,6 +20344,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15563,6 +20364,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15577,26 +20384,32 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_cmpxchg( ; GFX1250-LABEL: private_system_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -15604,6 +20417,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -15626,6 +20440,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -15646,6 +20466,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -15666,6 +20492,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15683,6 +20515,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15704,6 +20542,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -15724,6 +20568,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15742,6 +20592,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15758,6 +20614,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15775,6 +20637,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_one_as_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15792,6 +20660,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -15807,6 +20681,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -15822,6 +20702,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15836,6 +20722,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15850,26 +20742,32 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_cmpxchg( ; GFX1250-LABEL: private_system_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -15877,6 +20775,7 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -15904,6 +20803,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -15924,6 +20829,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -15944,6 +20855,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15961,6 +20878,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15982,6 +20905,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -16002,6 +20931,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16020,6 +20955,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16036,6 +20977,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16053,6 +21000,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16070,6 +21023,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -16085,6 +21044,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -16100,6 +21065,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16114,6 +21085,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16128,26 +21105,32 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -16155,6 +21138,7 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -16182,6 +21166,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -16202,6 +21192,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -16222,6 +21218,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16239,6 +21241,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16260,6 +21268,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -16280,6 +21294,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16298,6 +21318,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16314,6 +21340,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16331,6 +21363,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16348,6 +21386,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -16363,6 +21407,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -16378,6 +21428,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16392,6 +21448,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16406,26 +21468,32 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -16433,6 +21501,7 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -16460,6 +21529,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -16480,6 +21555,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -16500,6 +21581,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16517,6 +21604,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16538,6 +21631,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -16558,6 +21657,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16576,6 +21681,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16592,6 +21703,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16609,6 +21726,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16626,6 +21749,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -16641,6 +21770,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -16656,6 +21791,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16670,6 +21811,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16684,26 +21831,32 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -16711,6 +21864,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -16738,6 +21892,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -16758,6 +21918,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -16778,6 +21944,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16795,6 +21967,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16816,6 +21994,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -16836,6 +22020,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16854,6 +22044,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16870,6 +22066,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16887,6 +22089,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16904,6 +22112,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -16919,6 +22133,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -16934,6 +22154,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16948,6 +22174,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16962,26 +22194,32 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -16989,6 +22227,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -17016,6 +22255,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -17036,6 +22281,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -17056,6 +22307,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17073,6 +22330,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17094,6 +22357,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -17114,6 +22383,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17132,6 +22407,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17148,6 +22429,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17165,6 +22452,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_one_as_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17182,6 +22475,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -17197,6 +22496,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -17212,6 +22517,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17226,6 +22537,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17240,26 +22557,32 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_cmpxchg( ; GFX1250-LABEL: private_system_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -17267,6 +22590,7 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -17294,6 +22618,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -17314,6 +22644,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -17334,6 +22670,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17351,6 +22693,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17372,6 +22720,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -17392,6 +22746,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17410,6 +22770,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17426,6 +22792,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17443,6 +22815,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17460,6 +22838,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -17475,6 +22859,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -17490,6 +22880,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17504,6 +22900,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17518,26 +22920,32 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -17545,6 +22953,7 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -17572,6 +22981,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -17592,6 +23007,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -17612,6 +23033,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17629,6 +23056,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17650,6 +23083,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -17670,6 +23109,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17688,6 +23133,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17704,6 +23155,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17721,6 +23178,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17738,6 +23201,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -17753,6 +23222,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -17768,6 +23243,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17782,6 +23263,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17796,26 +23283,32 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -17823,6 +23316,7 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -17851,6 +23345,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_ret_cmpxchg ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -17875,6 +23375,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_ret_cmpxchg ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -17897,6 +23403,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_ret_cmpxchg ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17916,6 +23428,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_ret_cmpxchg ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17940,6 +23458,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -17962,6 +23486,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17982,6 +23512,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18001,6 +23537,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_ret_cmpxchg ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -18021,6 +23563,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_ret_cmpxchg ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -18040,6 +23588,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_ret_cmpxchg ; GFX11-WGP-LABEL: private_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -18056,6 +23610,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_ret_cmpxchg ; GFX11-CU-LABEL: private_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -18072,6 +23632,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_ret_cmpxchg ; GFX12-WGP-LABEL: private_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18087,6 +23653,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_ret_cmpxchg ; GFX12-CU-LABEL: private_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18103,25 +23675,31 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_ret_cmpxchg ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -18153,6 +23731,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -18177,6 +23761,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -18199,6 +23789,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18218,6 +23814,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18242,6 +23844,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -18264,6 +23872,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18284,6 +23898,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18303,6 +23923,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -18323,6 +23949,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -18342,6 +23974,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -18358,6 +23996,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -18374,6 +24018,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18389,6 +24039,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18405,25 +24061,31 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -18457,6 +24119,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -18481,6 +24149,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -18503,6 +24177,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18522,6 +24202,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18546,6 +24232,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -18568,6 +24260,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18588,6 +24286,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18607,6 +24311,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -18627,6 +24337,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -18646,6 +24362,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -18662,6 +24384,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -18678,6 +24406,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18693,6 +24427,12 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18709,25 +24449,31 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -18764,6 +24510,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -18788,6 +24540,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -18810,6 +24568,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18829,6 +24593,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18853,6 +24623,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -18875,6 +24651,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18895,6 +24677,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18914,6 +24702,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -18934,6 +24728,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -18953,6 +24753,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -18969,6 +24775,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -18985,6 +24797,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19000,6 +24818,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19016,25 +24840,31 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -19073,6 +24903,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -19097,6 +24933,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -19119,6 +24961,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19138,6 +24986,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19162,6 +25016,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -19184,6 +25044,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19204,6 +25070,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19223,6 +25095,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19243,6 +25121,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19262,6 +25146,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -19278,6 +25168,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -19294,6 +25190,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19309,6 +25211,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19325,25 +25233,31 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -19382,6 +25296,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -19406,6 +25326,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -19428,6 +25354,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19447,6 +25379,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19471,6 +25409,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -19493,6 +25437,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19513,6 +25463,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19532,6 +25488,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19552,6 +25514,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19571,6 +25539,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -19587,6 +25561,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -19603,6 +25583,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19618,6 +25604,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19634,25 +25626,31 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -19686,6 +25684,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -19710,6 +25714,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -19732,6 +25742,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19751,6 +25767,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19775,6 +25797,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -19797,6 +25825,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19817,6 +25851,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19836,6 +25876,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19856,6 +25902,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19875,6 +25927,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -19891,6 +25949,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -19907,6 +25971,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19922,6 +25992,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19938,25 +26014,31 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -19990,6 +26072,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -20014,6 +26102,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -20036,6 +26130,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20055,6 +26155,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20079,6 +26185,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -20101,6 +26213,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20121,6 +26239,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20140,6 +26264,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20160,6 +26290,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20179,6 +26315,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -20195,6 +26337,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -20211,6 +26359,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20226,6 +26380,12 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20242,25 +26402,31 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -20299,6 +26465,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -20323,6 +26495,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -20345,6 +26523,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20364,6 +26548,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20388,6 +26578,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -20410,6 +26606,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20430,6 +26632,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20449,6 +26657,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20469,6 +26683,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20488,6 +26708,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -20504,6 +26730,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -20520,6 +26752,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20535,6 +26773,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20551,25 +26795,31 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -20608,6 +26858,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -20632,6 +26888,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -20654,6 +26916,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20673,6 +26941,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20697,6 +26971,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -20719,6 +26999,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20739,6 +27025,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20758,6 +27050,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20778,6 +27076,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20797,6 +27101,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -20813,6 +27123,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -20829,6 +27145,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20844,6 +27166,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20860,25 +27188,31 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -20917,6 +27251,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -20941,6 +27281,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -20963,6 +27309,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20982,6 +27334,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21006,6 +27364,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -21028,6 +27392,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21048,6 +27418,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21067,6 +27443,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21087,6 +27469,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21106,6 +27494,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -21122,6 +27516,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -21138,6 +27538,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21153,6 +27559,12 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21169,25 +27581,31 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -21226,6 +27644,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -21250,6 +27674,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -21272,6 +27702,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21291,6 +27727,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21315,6 +27757,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -21337,6 +27785,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21357,6 +27811,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21376,6 +27836,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21396,6 +27862,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21415,6 +27887,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -21431,6 +27909,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -21447,6 +27931,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21462,6 +27952,12 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21478,25 +27974,31 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -21535,6 +28037,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -21559,6 +28067,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -21581,6 +28095,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21600,6 +28120,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21624,6 +28150,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -21646,6 +28178,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21666,6 +28204,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21685,6 +28229,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21705,6 +28255,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21724,6 +28280,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -21740,6 +28302,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -21756,6 +28324,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21771,6 +28345,12 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21787,25 +28367,31 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -21844,6 +28430,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -21868,6 +28460,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -21890,6 +28488,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21909,6 +28513,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21933,6 +28543,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -21955,6 +28571,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21975,6 +28597,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21994,6 +28622,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22014,6 +28648,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22033,6 +28673,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -22049,6 +28695,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -22065,6 +28717,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22080,6 +28738,12 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22096,25 +28760,31 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -22153,6 +28823,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -22177,6 +28853,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -22199,6 +28881,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22218,6 +28906,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22242,6 +28936,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -22264,6 +28964,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22284,6 +28990,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22303,6 +29015,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22323,6 +29041,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22342,6 +29066,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -22358,6 +29088,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -22374,6 +29110,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22389,6 +29131,12 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22405,25 +29153,31 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll index 2e9b915721a4..903d5ab3455c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -20,6 +20,9 @@ define amdgpu_kernel void @private_volatile_load_0( ; GFX6-NEXT: s_add_u32 s12, s12, s11 ; GFX6-NEXT: s_addc_u32 s13, s13, 0 ; GFX6-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -44,12 +47,15 @@ define amdgpu_kernel void @private_volatile_load_0( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen glc -; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_store_dword v[0:1], v2 @@ -59,13 +65,16 @@ define amdgpu_kernel void @private_volatile_load_0( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; @@ -73,13 +82,16 @@ define amdgpu_kernel void @private_volatile_load_0( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -92,6 +104,9 @@ define amdgpu_kernel void @private_volatile_load_0( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -111,62 +126,80 @@ define amdgpu_kernel void @private_volatile_load_0( ; ; GFX11-WGP-LABEL: private_volatile_load_0: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v1, off, s2 glc dlc -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_volatile_load_0: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v1, off, s2 glc dlc -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_volatile_load_0: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v1, off, s2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_volatile_load_0: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v1, off, s2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_volatile_load_0: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_load_b32 v1, off, s2 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(1) %out) { @@ -186,6 +219,9 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX6-NEXT: s_add_u32 s12, s12, s11 ; GFX6-NEXT: s_addc_u32 s13, s13, 0 ; GFX6-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -212,14 +248,17 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v0, s7, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], s6, v0 ; GFX7-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen glc -; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_store_dword v[0:1], v2 @@ -230,13 +269,16 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-WGP-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; @@ -245,13 +287,16 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-CU-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -264,6 +309,9 @@ define amdgpu_kernel void @private_volatile_load_1( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -286,37 +334,47 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX11-WGP-LABEL: private_volatile_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff ; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-WGP-NEXT: scratch_load_b32 v1, v1, off glc dlc -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_volatile_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff ; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-CU-NEXT: scratch_load_b32 v1, v1, off glc dlc -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_volatile_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff @@ -324,18 +382,22 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX12-WGP-NEXT: s_mov_b32 s3, 2 ; GFX12-WGP-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s3, v1 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v1, v1, s2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_volatile_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff @@ -343,11 +405,11 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX12-CU-NEXT: s_mov_b32 s3, 2 ; GFX12-CU-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s3, v1 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v1, v1, s2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; @@ -355,15 +417,19 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_mov_b32 s3, 0x3ff ; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_load_b32 v1, v1, s2 scale_offset scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(1) %out) { @@ -384,9 +450,12 @@ define amdgpu_kernel void @private_volatile_store_0( ; GFX6-NEXT: s_mov_b32 s15, 0xe8f000 ; GFX6-NEXT: s_add_u32 s12, s12, s11 ; GFX6-NEXT: s_addc_u32 s13, s13, 0 -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 -; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb ; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s1 @@ -399,9 +468,12 @@ define amdgpu_kernel void @private_volatile_store_0( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -414,9 +486,12 @@ define amdgpu_kernel void @private_volatile_store_0( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -429,9 +504,12 @@ define amdgpu_kernel void @private_volatile_store_0( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -448,9 +526,12 @@ define amdgpu_kernel void @private_volatile_store_0( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -461,9 +542,12 @@ define amdgpu_kernel void @private_volatile_store_0( ; ; GFX11-WGP-LABEL: private_volatile_store_0: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -473,9 +557,12 @@ define amdgpu_kernel void @private_volatile_store_0( ; ; GFX11-CU-LABEL: private_volatile_store_0: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -485,9 +572,12 @@ define amdgpu_kernel void @private_volatile_store_0( ; ; GFX12-WGP-LABEL: private_volatile_store_0: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -502,9 +592,12 @@ define amdgpu_kernel void @private_volatile_store_0( ; ; GFX12-CU-LABEL: private_volatile_store_0: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -520,9 +613,12 @@ define amdgpu_kernel void @private_volatile_store_0( ; GFX1250-LABEL: private_volatile_store_0: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 @@ -546,6 +642,9 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX6-NEXT: s_mov_b32 s15, 0xe8f000 ; GFX6-NEXT: s_add_u32 s12, s12, s11 ; GFX6-NEXT: s_addc_u32 s13, s13, 0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 ; GFX6-NEXT: s_load_dword s1, s[4:5], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -563,6 +662,9 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -580,6 +682,9 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -595,6 +700,9 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -614,6 +722,9 @@ define amdgpu_kernel void @private_volatile_store_1( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -629,6 +740,9 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX11-WGP-LABEL: private_volatile_store_1: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -644,6 +758,9 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX11-CU-LABEL: private_volatile_store_1: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -659,9 +776,12 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX12-WGP-LABEL: private_volatile_store_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 @@ -681,9 +801,12 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX12-CU-LABEL: private_volatile_store_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 @@ -704,9 +827,12 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX1250-LABEL: private_volatile_store_1: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b32 s2, 0x3ff diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll index 2bcb47b49d74..c59b6cc15ae9 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll @@ -19,11 +19,15 @@ define amdgpu_kernel void @private_wavefront_unordered_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -33,11 +37,15 @@ define amdgpu_kernel void @private_wavefront_unordered_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -47,11 +55,15 @@ define amdgpu_kernel void @private_wavefront_unordered_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -61,11 +73,15 @@ define amdgpu_kernel void @private_wavefront_unordered_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -79,11 +95,15 @@ define amdgpu_kernel void @private_wavefront_unordered_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -93,11 +113,15 @@ define amdgpu_kernel void @private_wavefront_unordered_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -107,11 +131,15 @@ define amdgpu_kernel void @private_wavefront_unordered_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -119,84 +147,108 @@ define amdgpu_kernel void @private_wavefront_unordered_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_wavefront_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_wavefront_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_wavefront_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_wavefront_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_wavefront_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_wavefront_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -205,6 +257,7 @@ define amdgpu_kernel void @private_wavefront_unordered_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -219,11 +272,15 @@ define amdgpu_kernel void @private_wavefront_monotonic_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -233,11 +290,15 @@ define amdgpu_kernel void @private_wavefront_monotonic_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -247,11 +308,15 @@ define amdgpu_kernel void @private_wavefront_monotonic_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -261,11 +326,15 @@ define amdgpu_kernel void @private_wavefront_monotonic_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -279,11 +348,15 @@ define amdgpu_kernel void @private_wavefront_monotonic_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -293,11 +366,15 @@ define amdgpu_kernel void @private_wavefront_monotonic_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -307,11 +384,15 @@ define amdgpu_kernel void @private_wavefront_monotonic_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -319,84 +400,108 @@ define amdgpu_kernel void @private_wavefront_monotonic_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_wavefront_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_wavefront_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_wavefront_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_wavefront_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_wavefront_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_wavefront_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -405,6 +510,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -419,11 +525,15 @@ define amdgpu_kernel void @private_wavefront_acquire_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -433,11 +543,15 @@ define amdgpu_kernel void @private_wavefront_acquire_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -447,11 +561,15 @@ define amdgpu_kernel void @private_wavefront_acquire_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -461,11 +579,15 @@ define amdgpu_kernel void @private_wavefront_acquire_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -479,11 +601,15 @@ define amdgpu_kernel void @private_wavefront_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -493,11 +619,15 @@ define amdgpu_kernel void @private_wavefront_acquire_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -507,11 +637,15 @@ define amdgpu_kernel void @private_wavefront_acquire_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -519,84 +653,108 @@ define amdgpu_kernel void @private_wavefront_acquire_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_wavefront_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_wavefront_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_wavefront_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_wavefront_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_wavefront_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_wavefront_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -605,6 +763,7 @@ define amdgpu_kernel void @private_wavefront_acquire_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -619,11 +778,15 @@ define amdgpu_kernel void @private_wavefront_seq_cst_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -633,11 +796,15 @@ define amdgpu_kernel void @private_wavefront_seq_cst_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -647,11 +814,15 @@ define amdgpu_kernel void @private_wavefront_seq_cst_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -661,11 +832,15 @@ define amdgpu_kernel void @private_wavefront_seq_cst_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -679,11 +854,15 @@ define amdgpu_kernel void @private_wavefront_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -693,11 +872,15 @@ define amdgpu_kernel void @private_wavefront_seq_cst_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -707,11 +890,15 @@ define amdgpu_kernel void @private_wavefront_seq_cst_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -719,84 +906,108 @@ define amdgpu_kernel void @private_wavefront_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_wavefront_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_wavefront_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_wavefront_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_wavefront_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_wavefront_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_wavefront_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -805,6 +1016,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -819,10 +1031,14 @@ define amdgpu_kernel void @private_wavefront_unordered_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -831,10 +1047,14 @@ define amdgpu_kernel void @private_wavefront_unordered_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -843,10 +1063,14 @@ define amdgpu_kernel void @private_wavefront_unordered_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -855,10 +1079,14 @@ define amdgpu_kernel void @private_wavefront_unordered_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -871,10 +1099,14 @@ define amdgpu_kernel void @private_wavefront_unordered_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -883,10 +1115,14 @@ define amdgpu_kernel void @private_wavefront_unordered_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -895,65 +1131,93 @@ define amdgpu_kernel void @private_wavefront_unordered_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_wavefront_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_wavefront_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_wavefront_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_wavefront_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_wavefront_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -961,22 +1225,26 @@ define amdgpu_kernel void @private_wavefront_unordered_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -998,10 +1266,14 @@ define amdgpu_kernel void @private_wavefront_monotonic_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -1010,10 +1282,14 @@ define amdgpu_kernel void @private_wavefront_monotonic_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -1022,10 +1298,14 @@ define amdgpu_kernel void @private_wavefront_monotonic_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -1034,10 +1314,14 @@ define amdgpu_kernel void @private_wavefront_monotonic_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -1050,10 +1334,14 @@ define amdgpu_kernel void @private_wavefront_monotonic_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1062,10 +1350,14 @@ define amdgpu_kernel void @private_wavefront_monotonic_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1074,65 +1366,93 @@ define amdgpu_kernel void @private_wavefront_monotonic_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_wavefront_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_wavefront_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_wavefront_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_wavefront_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_wavefront_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -1140,22 +1460,26 @@ define amdgpu_kernel void @private_wavefront_monotonic_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -1177,10 +1501,14 @@ define amdgpu_kernel void @private_wavefront_release_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -1189,10 +1517,14 @@ define amdgpu_kernel void @private_wavefront_release_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -1201,10 +1533,14 @@ define amdgpu_kernel void @private_wavefront_release_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -1213,10 +1549,14 @@ define amdgpu_kernel void @private_wavefront_release_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -1229,10 +1569,14 @@ define amdgpu_kernel void @private_wavefront_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1241,10 +1585,14 @@ define amdgpu_kernel void @private_wavefront_release_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1253,65 +1601,93 @@ define amdgpu_kernel void @private_wavefront_release_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_wavefront_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_wavefront_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_wavefront_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_wavefront_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_wavefront_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -1319,22 +1695,26 @@ define amdgpu_kernel void @private_wavefront_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -1356,10 +1736,14 @@ define amdgpu_kernel void @private_wavefront_seq_cst_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -1368,10 +1752,14 @@ define amdgpu_kernel void @private_wavefront_seq_cst_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -1380,10 +1768,14 @@ define amdgpu_kernel void @private_wavefront_seq_cst_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -1392,10 +1784,14 @@ define amdgpu_kernel void @private_wavefront_seq_cst_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -1408,10 +1804,14 @@ define amdgpu_kernel void @private_wavefront_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1420,10 +1820,14 @@ define amdgpu_kernel void @private_wavefront_seq_cst_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1432,65 +1836,93 @@ define amdgpu_kernel void @private_wavefront_seq_cst_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_wavefront_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_wavefront_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_wavefront_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_wavefront_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_wavefront_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -1498,22 +1930,26 @@ define amdgpu_kernel void @private_wavefront_seq_cst_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -1536,8 +1972,15 @@ define amdgpu_kernel void @private_wavefront_monotonic_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1548,8 +1991,15 @@ define amdgpu_kernel void @private_wavefront_monotonic_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1560,8 +2010,15 @@ define amdgpu_kernel void @private_wavefront_monotonic_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1572,8 +2029,15 @@ define amdgpu_kernel void @private_wavefront_monotonic_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1588,8 +2052,15 @@ define amdgpu_kernel void @private_wavefront_monotonic_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -1600,8 +2071,15 @@ define amdgpu_kernel void @private_wavefront_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1612,8 +2090,15 @@ define amdgpu_kernel void @private_wavefront_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1622,8 +2107,14 @@ define amdgpu_kernel void @private_wavefront_monotonic_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -1631,8 +2122,14 @@ define amdgpu_kernel void @private_wavefront_monotonic_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_wavefront_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -1640,8 +2137,14 @@ define amdgpu_kernel void @private_wavefront_monotonic_atomicrmw( ; GFX11-WGP-LABEL: private_wavefront_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -1649,8 +2152,14 @@ define amdgpu_kernel void @private_wavefront_monotonic_atomicrmw( ; GFX11-CU-LABEL: private_wavefront_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -1658,8 +2167,14 @@ define amdgpu_kernel void @private_wavefront_monotonic_atomicrmw( ; GFX12-WGP-LABEL: private_wavefront_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -1667,8 +2182,14 @@ define amdgpu_kernel void @private_wavefront_monotonic_atomicrmw( ; GFX12-CU-LABEL: private_wavefront_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -1676,29 +2197,34 @@ define amdgpu_kernel void @private_wavefront_monotonic_atomicrmw( ; GFX1250-LABEL: private_wavefront_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -1715,8 +2241,15 @@ define amdgpu_kernel void @private_wavefront_acquire_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1727,8 +2260,15 @@ define amdgpu_kernel void @private_wavefront_acquire_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1739,8 +2279,15 @@ define amdgpu_kernel void @private_wavefront_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1751,8 +2298,15 @@ define amdgpu_kernel void @private_wavefront_acquire_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1767,8 +2321,15 @@ define amdgpu_kernel void @private_wavefront_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -1779,8 +2340,15 @@ define amdgpu_kernel void @private_wavefront_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1791,8 +2359,15 @@ define amdgpu_kernel void @private_wavefront_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1801,8 +2376,14 @@ define amdgpu_kernel void @private_wavefront_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -1810,8 +2391,14 @@ define amdgpu_kernel void @private_wavefront_acquire_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_wavefront_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -1819,8 +2406,14 @@ define amdgpu_kernel void @private_wavefront_acquire_atomicrmw( ; GFX11-WGP-LABEL: private_wavefront_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -1828,8 +2421,14 @@ define amdgpu_kernel void @private_wavefront_acquire_atomicrmw( ; GFX11-CU-LABEL: private_wavefront_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -1837,8 +2436,14 @@ define amdgpu_kernel void @private_wavefront_acquire_atomicrmw( ; GFX12-WGP-LABEL: private_wavefront_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -1846,8 +2451,14 @@ define amdgpu_kernel void @private_wavefront_acquire_atomicrmw( ; GFX12-CU-LABEL: private_wavefront_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -1855,29 +2466,34 @@ define amdgpu_kernel void @private_wavefront_acquire_atomicrmw( ; GFX1250-LABEL: private_wavefront_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -1894,8 +2510,15 @@ define amdgpu_kernel void @private_wavefront_release_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1906,8 +2529,15 @@ define amdgpu_kernel void @private_wavefront_release_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1918,8 +2548,15 @@ define amdgpu_kernel void @private_wavefront_release_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1930,8 +2567,15 @@ define amdgpu_kernel void @private_wavefront_release_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1946,8 +2590,15 @@ define amdgpu_kernel void @private_wavefront_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -1958,8 +2609,15 @@ define amdgpu_kernel void @private_wavefront_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1970,8 +2628,15 @@ define amdgpu_kernel void @private_wavefront_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1980,8 +2645,14 @@ define amdgpu_kernel void @private_wavefront_release_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -1989,8 +2660,14 @@ define amdgpu_kernel void @private_wavefront_release_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_wavefront_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -1998,8 +2675,14 @@ define amdgpu_kernel void @private_wavefront_release_atomicrmw( ; GFX11-WGP-LABEL: private_wavefront_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -2007,8 +2690,14 @@ define amdgpu_kernel void @private_wavefront_release_atomicrmw( ; GFX11-CU-LABEL: private_wavefront_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -2016,8 +2705,14 @@ define amdgpu_kernel void @private_wavefront_release_atomicrmw( ; GFX12-WGP-LABEL: private_wavefront_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -2025,8 +2720,14 @@ define amdgpu_kernel void @private_wavefront_release_atomicrmw( ; GFX12-CU-LABEL: private_wavefront_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -2034,29 +2735,34 @@ define amdgpu_kernel void @private_wavefront_release_atomicrmw( ; GFX1250-LABEL: private_wavefront_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -2073,8 +2779,15 @@ define amdgpu_kernel void @private_wavefront_acq_rel_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2085,8 +2798,15 @@ define amdgpu_kernel void @private_wavefront_acq_rel_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2097,8 +2817,15 @@ define amdgpu_kernel void @private_wavefront_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2109,8 +2836,15 @@ define amdgpu_kernel void @private_wavefront_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2125,8 +2859,15 @@ define amdgpu_kernel void @private_wavefront_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -2137,8 +2878,15 @@ define amdgpu_kernel void @private_wavefront_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2149,8 +2897,15 @@ define amdgpu_kernel void @private_wavefront_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2159,8 +2914,14 @@ define amdgpu_kernel void @private_wavefront_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2168,8 +2929,14 @@ define amdgpu_kernel void @private_wavefront_acq_rel_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_wavefront_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -2177,8 +2944,14 @@ define amdgpu_kernel void @private_wavefront_acq_rel_atomicrmw( ; GFX11-WGP-LABEL: private_wavefront_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -2186,8 +2959,14 @@ define amdgpu_kernel void @private_wavefront_acq_rel_atomicrmw( ; GFX11-CU-LABEL: private_wavefront_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -2195,8 +2974,14 @@ define amdgpu_kernel void @private_wavefront_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: private_wavefront_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -2204,8 +2989,14 @@ define amdgpu_kernel void @private_wavefront_acq_rel_atomicrmw( ; GFX12-CU-LABEL: private_wavefront_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -2213,29 +3004,34 @@ define amdgpu_kernel void @private_wavefront_acq_rel_atomicrmw( ; GFX1250-LABEL: private_wavefront_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -2252,8 +3048,15 @@ define amdgpu_kernel void @private_wavefront_seq_cst_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2264,8 +3067,15 @@ define amdgpu_kernel void @private_wavefront_seq_cst_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2276,8 +3086,15 @@ define amdgpu_kernel void @private_wavefront_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2288,8 +3105,15 @@ define amdgpu_kernel void @private_wavefront_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2304,8 +3128,15 @@ define amdgpu_kernel void @private_wavefront_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -2316,8 +3147,15 @@ define amdgpu_kernel void @private_wavefront_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2328,8 +3166,15 @@ define amdgpu_kernel void @private_wavefront_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2338,8 +3183,14 @@ define amdgpu_kernel void @private_wavefront_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2347,8 +3198,14 @@ define amdgpu_kernel void @private_wavefront_seq_cst_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_wavefront_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -2356,8 +3213,14 @@ define amdgpu_kernel void @private_wavefront_seq_cst_atomicrmw( ; GFX11-WGP-LABEL: private_wavefront_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -2365,8 +3228,14 @@ define amdgpu_kernel void @private_wavefront_seq_cst_atomicrmw( ; GFX11-CU-LABEL: private_wavefront_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -2374,8 +3243,14 @@ define amdgpu_kernel void @private_wavefront_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: private_wavefront_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -2383,8 +3258,14 @@ define amdgpu_kernel void @private_wavefront_seq_cst_atomicrmw( ; GFX12-CU-LABEL: private_wavefront_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -2392,29 +3273,34 @@ define amdgpu_kernel void @private_wavefront_seq_cst_atomicrmw( ; GFX1250-LABEL: private_wavefront_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -2431,6 +3317,10 @@ define amdgpu_kernel void @private_wavefront_acquire_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2449,6 +3339,10 @@ define amdgpu_kernel void @private_wavefront_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2466,6 +3360,10 @@ define amdgpu_kernel void @private_wavefront_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2483,6 +3381,10 @@ define amdgpu_kernel void @private_wavefront_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2504,6 +3406,10 @@ define amdgpu_kernel void @private_wavefront_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2521,6 +3427,10 @@ define amdgpu_kernel void @private_wavefront_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2538,6 +3448,10 @@ define amdgpu_kernel void @private_wavefront_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2553,6 +3467,10 @@ define amdgpu_kernel void @private_wavefront_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -2565,6 +3483,10 @@ define amdgpu_kernel void @private_wavefront_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_wavefront_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -2577,6 +3499,10 @@ define amdgpu_kernel void @private_wavefront_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: private_wavefront_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -2589,6 +3515,10 @@ define amdgpu_kernel void @private_wavefront_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: private_wavefront_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -2601,6 +3531,10 @@ define amdgpu_kernel void @private_wavefront_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: private_wavefront_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -2613,6 +3547,10 @@ define amdgpu_kernel void @private_wavefront_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: private_wavefront_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -2626,22 +3564,26 @@ define amdgpu_kernel void @private_wavefront_acquire_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -2667,6 +3609,10 @@ define amdgpu_kernel void @private_wavefront_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2685,6 +3631,10 @@ define amdgpu_kernel void @private_wavefront_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2702,6 +3652,10 @@ define amdgpu_kernel void @private_wavefront_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2719,6 +3673,10 @@ define amdgpu_kernel void @private_wavefront_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2740,6 +3698,10 @@ define amdgpu_kernel void @private_wavefront_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2757,6 +3719,10 @@ define amdgpu_kernel void @private_wavefront_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2774,6 +3740,10 @@ define amdgpu_kernel void @private_wavefront_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2789,6 +3759,10 @@ define amdgpu_kernel void @private_wavefront_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -2801,6 +3775,10 @@ define amdgpu_kernel void @private_wavefront_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_wavefront_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -2813,6 +3791,10 @@ define amdgpu_kernel void @private_wavefront_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: private_wavefront_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -2825,6 +3807,10 @@ define amdgpu_kernel void @private_wavefront_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: private_wavefront_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -2837,6 +3823,10 @@ define amdgpu_kernel void @private_wavefront_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: private_wavefront_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -2849,6 +3839,10 @@ define amdgpu_kernel void @private_wavefront_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: private_wavefront_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -2862,22 +3856,26 @@ define amdgpu_kernel void @private_wavefront_acq_rel_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -2903,6 +3901,10 @@ define amdgpu_kernel void @private_wavefront_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2921,6 +3923,10 @@ define amdgpu_kernel void @private_wavefront_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2938,6 +3944,10 @@ define amdgpu_kernel void @private_wavefront_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2955,6 +3965,10 @@ define amdgpu_kernel void @private_wavefront_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2976,6 +3990,10 @@ define amdgpu_kernel void @private_wavefront_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2993,6 +4011,10 @@ define amdgpu_kernel void @private_wavefront_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -3010,6 +4032,10 @@ define amdgpu_kernel void @private_wavefront_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -3025,6 +4051,10 @@ define amdgpu_kernel void @private_wavefront_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -3037,6 +4067,10 @@ define amdgpu_kernel void @private_wavefront_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_wavefront_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -3049,6 +4083,10 @@ define amdgpu_kernel void @private_wavefront_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: private_wavefront_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -3061,6 +4099,10 @@ define amdgpu_kernel void @private_wavefront_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: private_wavefront_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -3073,6 +4115,10 @@ define amdgpu_kernel void @private_wavefront_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: private_wavefront_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -3085,6 +4131,10 @@ define amdgpu_kernel void @private_wavefront_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: private_wavefront_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -3098,22 +4148,26 @@ define amdgpu_kernel void @private_wavefront_seq_cst_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -3139,6 +4193,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -3159,6 +4219,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -3179,6 +4245,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3196,6 +4268,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3217,6 +4295,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -3237,6 +4321,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3255,6 +4345,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3271,6 +4367,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3288,6 +4390,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3305,6 +4413,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -3320,6 +4434,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -3335,6 +4455,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3349,6 +4475,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3363,26 +4495,32 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: private_wavefront_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -3390,6 +4528,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -3409,6 +4548,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -3429,6 +4574,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -3449,6 +4600,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3466,6 +4623,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3487,6 +4650,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -3507,6 +4676,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3525,6 +4700,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3541,6 +4722,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3558,6 +4745,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3575,6 +4768,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -3590,6 +4789,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -3605,6 +4810,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3619,6 +4830,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3633,26 +4850,32 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: private_wavefront_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -3660,6 +4883,7 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -3679,6 +4903,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -3699,6 +4929,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -3719,6 +4955,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3736,6 +4978,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3757,6 +5005,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -3777,6 +5031,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3795,6 +5055,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3811,6 +5077,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3828,6 +5100,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3845,6 +5123,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -3860,6 +5144,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -3875,6 +5165,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3889,6 +5185,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3903,26 +5205,32 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_cmpxchg( ; GFX1250-LABEL: private_wavefront_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -3930,6 +5238,7 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -3949,6 +5258,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -3969,6 +5284,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -3989,6 +5310,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4006,6 +5333,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4027,6 +5360,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -4047,6 +5386,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4065,6 +5410,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4081,6 +5432,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4098,6 +5455,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4115,6 +5478,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -4130,6 +5499,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -4145,6 +5520,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4159,6 +5540,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4173,26 +5560,32 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -4200,6 +5593,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -4219,6 +5613,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -4239,6 +5639,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -4259,6 +5665,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4276,6 +5688,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4297,6 +5715,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -4317,6 +5741,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4335,6 +5765,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4351,6 +5787,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4368,6 +5810,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4385,6 +5833,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -4400,6 +5854,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -4415,6 +5875,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4429,6 +5895,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4443,26 +5915,32 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -4470,6 +5948,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -4489,6 +5968,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -4509,6 +5994,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -4529,6 +6020,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4546,6 +6043,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4567,6 +6070,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -4587,6 +6096,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4605,6 +6120,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4621,6 +6142,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4638,6 +6165,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4655,6 +6188,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -4670,6 +6209,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -4685,6 +6230,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4699,6 +6250,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4713,26 +6270,32 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: private_wavefront_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -4740,6 +6303,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -4759,6 +6323,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -4779,6 +6349,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -4799,6 +6375,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4816,6 +6398,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4837,6 +6425,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -4857,6 +6451,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4875,6 +6475,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4891,6 +6497,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4908,6 +6520,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4925,6 +6543,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -4940,6 +6564,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -4955,6 +6585,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4969,6 +6605,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4983,26 +6625,32 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_cmpxchg( ; GFX1250-LABEL: private_wavefront_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -5010,6 +6658,7 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -5029,6 +6678,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -5049,6 +6704,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -5069,6 +6730,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5086,6 +6753,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5107,6 +6780,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -5127,6 +6806,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5145,6 +6830,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5161,6 +6852,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5178,6 +6875,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5195,6 +6898,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -5210,6 +6919,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -5225,6 +6940,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5239,6 +6960,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5253,26 +6980,32 @@ define amdgpu_kernel void @private_wavefront_release_acquire_cmpxchg( ; GFX1250-LABEL: private_wavefront_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -5280,6 +7013,7 @@ define amdgpu_kernel void @private_wavefront_release_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -5299,6 +7033,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -5319,6 +7059,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -5339,6 +7085,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5356,6 +7108,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5377,6 +7135,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -5397,6 +7161,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5415,6 +7185,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5431,6 +7207,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5448,6 +7230,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5465,6 +7253,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -5480,6 +7274,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -5495,6 +7295,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5509,6 +7315,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5523,26 +7335,32 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: private_wavefront_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -5550,6 +7368,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -5569,6 +7388,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -5589,6 +7414,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -5609,6 +7440,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5626,6 +7463,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5647,6 +7490,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -5667,6 +7516,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5685,6 +7540,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5701,6 +7562,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5718,6 +7585,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5735,6 +7608,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -5750,6 +7629,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -5765,6 +7650,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5779,6 +7670,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5793,26 +7690,32 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: private_wavefront_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -5820,6 +7723,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -5839,6 +7743,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -5859,6 +7769,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -5879,6 +7795,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5896,6 +7818,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5917,6 +7845,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -5937,6 +7871,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5955,6 +7895,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5971,6 +7917,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5988,6 +7940,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6005,6 +7963,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -6020,6 +7984,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -6035,6 +8005,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6049,6 +8025,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6063,26 +8045,32 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -6090,6 +8078,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -6109,6 +8098,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -6129,6 +8124,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -6149,6 +8150,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6166,6 +8173,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6187,6 +8200,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -6207,6 +8226,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6225,6 +8250,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6241,6 +8272,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6258,6 +8295,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6275,6 +8318,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -6290,6 +8339,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -6305,6 +8360,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6319,6 +8380,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6333,26 +8400,32 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: private_wavefront_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -6360,6 +8433,7 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -6379,6 +8453,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -6399,6 +8479,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -6419,6 +8505,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6436,6 +8528,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6457,6 +8555,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -6477,6 +8581,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6495,6 +8605,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6511,6 +8627,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6528,6 +8650,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6545,6 +8673,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -6560,6 +8694,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -6575,6 +8715,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6589,6 +8735,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6603,26 +8755,32 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_cmpxchg( ; GFX1250-LABEL: private_wavefront_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -6630,6 +8788,7 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -6649,6 +8808,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -6669,6 +8834,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -6689,6 +8860,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6706,6 +8883,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6727,6 +8910,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -6747,6 +8936,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6765,6 +8960,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6781,6 +8982,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6798,6 +9005,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6815,6 +9028,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -6830,6 +9049,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -6845,6 +9070,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6859,6 +9090,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6873,26 +9110,32 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -6900,6 +9143,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -6919,6 +9163,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -6939,6 +9189,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -6959,6 +9215,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6976,6 +9238,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6997,6 +9265,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -7017,6 +9291,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7035,6 +9315,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7051,6 +9337,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -7068,6 +9360,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -7085,6 +9383,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -7100,6 +9404,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -7115,6 +9425,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7129,6 +9445,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7143,26 +9465,32 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -7170,6 +9498,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -7190,6 +9519,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -7214,6 +9549,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -7236,6 +9577,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7255,6 +9602,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7279,6 +9632,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -7301,6 +9660,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7321,6 +9686,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7340,6 +9711,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7360,6 +9737,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7379,6 +9762,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -7395,6 +9784,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -7411,6 +9806,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7426,6 +9827,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7442,25 +9849,31 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -7492,6 +9905,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -7516,6 +9935,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -7538,6 +9963,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7557,6 +9988,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7581,6 +10018,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -7603,6 +10046,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7623,6 +10072,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7642,6 +10097,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7662,6 +10123,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7681,6 +10148,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -7697,6 +10170,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -7713,6 +10192,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7728,6 +10213,12 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7744,25 +10235,31 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -7794,6 +10291,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -7818,6 +10321,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -7840,6 +10349,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7859,6 +10374,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7883,6 +10404,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -7905,6 +10432,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7925,6 +10458,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7944,6 +10483,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7964,6 +10509,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7983,6 +10534,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -7999,6 +10556,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -8015,6 +10578,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8030,6 +10599,12 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8046,25 +10621,31 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -8096,6 +10677,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -8120,6 +10707,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -8142,6 +10735,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8161,6 +10760,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8185,6 +10790,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -8207,6 +10818,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8227,6 +10844,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8246,6 +10869,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8266,6 +10895,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8285,6 +10920,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -8301,6 +10942,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -8317,6 +10964,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8332,6 +10985,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8348,25 +11007,31 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -8398,6 +11063,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -8422,6 +11093,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -8444,6 +11121,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8463,6 +11146,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8487,6 +11176,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -8509,6 +11204,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8529,6 +11230,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8548,6 +11255,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8568,6 +11281,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8587,6 +11306,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -8603,6 +11328,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -8619,6 +11350,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8634,6 +11371,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8650,25 +11393,31 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -8700,6 +11449,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -8724,6 +11479,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -8746,6 +11507,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8765,6 +11532,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8789,6 +11562,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -8811,6 +11590,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8831,6 +11616,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8850,6 +11641,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8870,6 +11667,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8889,6 +11692,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -8905,6 +11714,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -8921,6 +11736,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8936,6 +11757,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8952,25 +11779,31 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -9002,6 +11835,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -9026,6 +11865,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -9048,6 +11893,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9067,6 +11918,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9091,6 +11948,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -9113,6 +11976,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9133,6 +12002,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9152,6 +12027,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9172,6 +12053,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9191,6 +12078,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -9207,6 +12100,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -9223,6 +12122,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9238,6 +12143,12 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9254,25 +12165,31 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -9304,6 +12221,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -9328,6 +12251,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -9350,6 +12279,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9369,6 +12304,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9393,6 +12334,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -9415,6 +12362,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9435,6 +12388,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9454,6 +12413,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9474,6 +12439,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9493,6 +12464,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -9509,6 +12486,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -9525,6 +12508,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9540,6 +12529,12 @@ define amdgpu_kernel void @private_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9556,25 +12551,31 @@ define amdgpu_kernel void @private_wavefront_release_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -9606,6 +12607,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -9630,6 +12637,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -9652,6 +12665,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9671,6 +12690,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9695,6 +12720,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -9717,6 +12748,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9737,6 +12774,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9756,6 +12799,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9776,6 +12825,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9795,6 +12850,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -9811,6 +12872,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -9827,6 +12894,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9842,6 +12915,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9858,25 +12937,31 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -9908,6 +12993,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -9932,6 +13023,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -9954,6 +13051,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9973,6 +13076,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9997,6 +13106,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -10019,6 +13134,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10039,6 +13160,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10058,6 +13185,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10078,6 +13211,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10097,6 +13236,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -10113,6 +13258,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -10129,6 +13280,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10144,6 +13301,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10160,25 +13323,31 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -10210,6 +13379,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -10234,6 +13409,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -10256,6 +13437,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10275,6 +13462,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10299,6 +13492,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -10321,6 +13520,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10341,6 +13546,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10360,6 +13571,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10380,6 +13597,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10399,6 +13622,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -10415,6 +13644,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -10431,6 +13666,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10446,6 +13687,12 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10462,25 +13709,31 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -10512,6 +13765,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -10536,6 +13795,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -10558,6 +13823,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10577,6 +13848,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10601,6 +13878,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -10623,6 +13906,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10643,6 +13932,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10662,6 +13957,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10682,6 +13983,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10701,6 +14008,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -10717,6 +14030,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -10733,6 +14052,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10748,6 +14073,12 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10764,25 +14095,31 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -10814,6 +14151,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -10838,6 +14181,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -10860,6 +14209,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10879,6 +14234,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10903,6 +14264,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -10925,6 +14292,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10945,6 +14318,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10964,6 +14343,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10984,6 +14369,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11003,6 +14394,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -11019,6 +14416,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -11035,6 +14438,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11050,6 +14459,12 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11066,25 +14481,31 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -11116,6 +14537,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -11140,6 +14567,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -11162,6 +14595,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11181,6 +14620,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11205,6 +14650,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -11227,6 +14678,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11247,6 +14704,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11266,6 +14729,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11286,6 +14755,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11305,6 +14780,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -11321,6 +14802,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -11337,6 +14824,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11352,6 +14845,12 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11368,25 +14867,31 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -11418,6 +14923,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -11442,6 +14953,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -11464,6 +14981,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11483,6 +15006,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11507,6 +15036,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -11529,6 +15064,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11549,6 +15090,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11568,6 +15115,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11588,6 +15141,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11607,6 +15166,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -11623,6 +15188,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -11639,6 +15210,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11654,6 +15231,12 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11670,25 +15253,31 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -11718,11 +15307,15 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11732,11 +15325,15 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11746,11 +15343,15 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11760,11 +15361,15 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11778,11 +15383,15 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -11792,11 +15401,15 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11806,11 +15419,15 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11818,84 +15435,108 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_wavefront_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_wavefront_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_wavefront_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_wavefront_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_wavefront_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -11904,6 +15545,7 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -11918,11 +15560,15 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11932,11 +15578,15 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11946,11 +15596,15 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11960,11 +15614,15 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11978,11 +15636,15 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -11992,11 +15654,15 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12006,11 +15672,15 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12018,84 +15688,108 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_wavefront_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_wavefront_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_wavefront_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_wavefront_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_wavefront_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12104,6 +15798,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -12118,11 +15813,15 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12132,11 +15831,15 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12146,11 +15849,15 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12160,11 +15867,15 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12178,11 +15889,15 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -12192,11 +15907,15 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12206,11 +15925,15 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12218,84 +15941,108 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_wavefront_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_wavefront_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_wavefront_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_wavefront_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_wavefront_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12304,6 +16051,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -12318,11 +16066,15 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12332,11 +16084,15 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12346,11 +16102,15 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12360,11 +16120,15 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12378,11 +16142,15 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -12392,11 +16160,15 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12406,11 +16178,15 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12418,84 +16194,108 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_wavefront_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_wavefront_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_wavefront_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_wavefront_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12504,6 +16304,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -12518,10 +16319,14 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -12530,10 +16335,14 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -12542,10 +16351,14 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -12554,10 +16367,14 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -12570,10 +16387,14 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -12582,10 +16403,14 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -12594,65 +16419,93 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_wavefront_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_wavefront_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_wavefront_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_wavefront_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -12660,22 +16513,26 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12697,10 +16554,14 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -12709,10 +16570,14 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -12721,10 +16586,14 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -12733,10 +16602,14 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -12749,10 +16622,14 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -12761,10 +16638,14 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -12773,65 +16654,93 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_wavefront_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_wavefront_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_wavefront_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_wavefront_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -12839,22 +16748,26 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12876,10 +16789,14 @@ define amdgpu_kernel void @private_wavefront_one_as_release_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -12888,10 +16805,14 @@ define amdgpu_kernel void @private_wavefront_one_as_release_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -12900,10 +16821,14 @@ define amdgpu_kernel void @private_wavefront_one_as_release_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -12912,10 +16837,14 @@ define amdgpu_kernel void @private_wavefront_one_as_release_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -12928,10 +16857,14 @@ define amdgpu_kernel void @private_wavefront_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -12940,10 +16873,14 @@ define amdgpu_kernel void @private_wavefront_one_as_release_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -12952,65 +16889,93 @@ define amdgpu_kernel void @private_wavefront_one_as_release_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_wavefront_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_wavefront_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_wavefront_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_wavefront_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -13018,22 +16983,26 @@ define amdgpu_kernel void @private_wavefront_one_as_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -13055,10 +17024,14 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -13067,10 +17040,14 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -13079,10 +17056,14 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -13091,10 +17072,14 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -13107,10 +17092,14 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -13119,10 +17108,14 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -13131,65 +17124,93 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_wavefront_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_wavefront_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_wavefront_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_wavefront_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -13197,22 +17218,26 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -13235,8 +17260,15 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13247,8 +17279,15 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13259,8 +17298,15 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13271,8 +17317,15 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13287,8 +17340,15 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -13299,8 +17359,15 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13311,8 +17378,15 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13321,8 +17395,14 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -13330,8 +17410,14 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -13339,8 +17425,14 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_atomicrmw( ; GFX11-WGP-LABEL: private_wavefront_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -13348,8 +17440,14 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_atomicrmw( ; GFX11-CU-LABEL: private_wavefront_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -13357,8 +17455,14 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_atomicrmw( ; GFX12-WGP-LABEL: private_wavefront_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -13366,8 +17470,14 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_atomicrmw( ; GFX12-CU-LABEL: private_wavefront_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -13375,29 +17485,34 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_atomicrmw( ; GFX1250-LABEL: private_wavefront_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -13414,8 +17529,15 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13426,8 +17548,15 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13438,8 +17567,15 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13450,8 +17586,15 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13466,8 +17609,15 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -13478,8 +17628,15 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13490,8 +17647,15 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13500,8 +17664,14 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -13509,8 +17679,14 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -13518,8 +17694,14 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_atomicrmw( ; GFX11-WGP-LABEL: private_wavefront_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -13527,8 +17709,14 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_atomicrmw( ; GFX11-CU-LABEL: private_wavefront_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -13536,8 +17724,14 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_atomicrmw( ; GFX12-WGP-LABEL: private_wavefront_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -13545,8 +17739,14 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_atomicrmw( ; GFX12-CU-LABEL: private_wavefront_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -13554,29 +17754,34 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_atomicrmw( ; GFX1250-LABEL: private_wavefront_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -13593,8 +17798,15 @@ define amdgpu_kernel void @private_wavefront_one_as_release_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13605,8 +17817,15 @@ define amdgpu_kernel void @private_wavefront_one_as_release_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13617,8 +17836,15 @@ define amdgpu_kernel void @private_wavefront_one_as_release_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13629,8 +17855,15 @@ define amdgpu_kernel void @private_wavefront_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13645,8 +17878,15 @@ define amdgpu_kernel void @private_wavefront_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -13657,8 +17897,15 @@ define amdgpu_kernel void @private_wavefront_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13669,8 +17916,15 @@ define amdgpu_kernel void @private_wavefront_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13679,8 +17933,14 @@ define amdgpu_kernel void @private_wavefront_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -13688,8 +17948,14 @@ define amdgpu_kernel void @private_wavefront_one_as_release_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -13697,8 +17963,14 @@ define amdgpu_kernel void @private_wavefront_one_as_release_atomicrmw( ; GFX11-WGP-LABEL: private_wavefront_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -13706,8 +17978,14 @@ define amdgpu_kernel void @private_wavefront_one_as_release_atomicrmw( ; GFX11-CU-LABEL: private_wavefront_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -13715,8 +17993,14 @@ define amdgpu_kernel void @private_wavefront_one_as_release_atomicrmw( ; GFX12-WGP-LABEL: private_wavefront_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -13724,8 +18008,14 @@ define amdgpu_kernel void @private_wavefront_one_as_release_atomicrmw( ; GFX12-CU-LABEL: private_wavefront_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -13733,29 +18023,34 @@ define amdgpu_kernel void @private_wavefront_one_as_release_atomicrmw( ; GFX1250-LABEL: private_wavefront_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -13772,8 +18067,15 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13784,8 +18086,15 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13796,8 +18105,15 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13808,8 +18124,15 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13824,8 +18147,15 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -13836,8 +18166,15 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13848,8 +18185,15 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13858,8 +18202,14 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -13867,8 +18217,14 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -13876,8 +18232,14 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_atomicrmw( ; GFX11-WGP-LABEL: private_wavefront_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -13885,8 +18247,14 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_atomicrmw( ; GFX11-CU-LABEL: private_wavefront_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -13894,8 +18262,14 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: private_wavefront_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -13903,8 +18277,14 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_atomicrmw( ; GFX12-CU-LABEL: private_wavefront_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -13912,29 +18292,34 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_atomicrmw( ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -13951,8 +18336,15 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13963,8 +18355,15 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13975,8 +18374,15 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13987,8 +18393,15 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14003,8 +18416,15 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -14015,8 +18435,15 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14027,8 +18454,15 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14037,8 +18471,14 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -14046,8 +18486,14 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -14055,8 +18501,14 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_atomicrmw( ; GFX11-WGP-LABEL: private_wavefront_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -14064,8 +18516,14 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_atomicrmw( ; GFX11-CU-LABEL: private_wavefront_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -14073,8 +18531,14 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: private_wavefront_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -14082,8 +18546,14 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_atomicrmw( ; GFX12-CU-LABEL: private_wavefront_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -14091,29 +18561,34 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_atomicrmw( ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -14130,6 +18605,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14148,6 +18627,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14165,6 +18648,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -14182,6 +18669,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -14203,6 +18694,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14220,6 +18715,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14237,6 +18736,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14252,6 +18755,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14264,6 +18771,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14276,6 +18787,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: private_wavefront_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14288,6 +18803,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: private_wavefront_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14300,6 +18819,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: private_wavefront_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14312,6 +18835,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: private_wavefront_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14325,22 +18852,26 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -14366,6 +18897,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14384,6 +18919,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14401,6 +18940,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -14418,6 +18961,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -14439,6 +18986,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14456,6 +19007,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14473,6 +19028,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14488,6 +19047,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14500,6 +19063,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14512,6 +19079,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: private_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14524,6 +19095,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: private_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14536,6 +19111,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: private_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14548,6 +19127,10 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: private_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14561,22 +19144,26 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -14602,6 +19189,10 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14620,6 +19211,10 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14637,6 +19232,10 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -14654,6 +19253,10 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -14675,6 +19278,10 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14692,6 +19299,10 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14709,6 +19320,10 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14724,6 +19339,10 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14736,6 +19355,10 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14748,6 +19371,10 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: private_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14760,6 +19387,10 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: private_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14772,6 +19403,10 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: private_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14784,6 +19419,10 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: private_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14797,22 +19436,26 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -14838,6 +19481,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -14858,6 +19507,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -14878,6 +19533,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14895,6 +19556,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14916,6 +19583,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -14936,6 +19609,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14954,6 +19633,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14970,6 +19655,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -14987,6 +19678,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15004,6 +19701,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -15019,6 +19722,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -15034,6 +19743,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15048,6 +19763,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15062,26 +19783,32 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -15089,6 +19816,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -15108,6 +19836,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -15128,6 +19862,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -15148,6 +19888,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15165,6 +19911,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15186,6 +19938,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -15206,6 +19964,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15224,6 +19988,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15240,6 +20010,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15257,6 +20033,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15274,6 +20056,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -15289,6 +20077,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -15304,6 +20098,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15318,6 +20118,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15332,26 +20138,32 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -15359,6 +20171,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -15378,6 +20191,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -15398,6 +20217,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -15418,6 +20243,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15435,6 +20266,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15456,6 +20293,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -15476,6 +20319,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15494,6 +20343,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15510,6 +20365,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15527,6 +20388,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15544,6 +20411,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -15559,6 +20432,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -15574,6 +20453,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15588,6 +20473,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15602,26 +20493,32 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_cmpxchg( ; GFX1250-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -15629,6 +20526,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -15648,6 +20546,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -15668,6 +20572,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -15688,6 +20598,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15705,6 +20621,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15726,6 +20648,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -15746,6 +20674,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15764,6 +20698,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15780,6 +20720,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15797,6 +20743,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15814,6 +20766,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -15829,6 +20787,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -15844,6 +20808,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15858,6 +20828,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15872,26 +20848,32 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -15899,6 +20881,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -15918,6 +20901,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -15938,6 +20927,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -15958,6 +20953,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15975,6 +20976,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15996,6 +21003,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -16016,6 +21029,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16034,6 +21053,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16050,6 +21075,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16067,6 +21098,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16084,6 +21121,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -16099,6 +21142,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -16114,6 +21163,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16128,6 +21183,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16142,26 +21203,32 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -16169,6 +21236,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -16188,6 +21256,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -16208,6 +21282,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -16228,6 +21308,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16245,6 +21331,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16266,6 +21358,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -16286,6 +21384,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16304,6 +21408,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16320,6 +21430,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16337,6 +21453,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16354,6 +21476,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -16369,6 +21497,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -16384,6 +21518,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16398,6 +21538,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16412,26 +21558,32 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -16439,6 +21591,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -16458,6 +21611,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -16478,6 +21637,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -16498,6 +21663,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16515,6 +21686,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16536,6 +21713,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -16556,6 +21739,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16574,6 +21763,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16590,6 +21785,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16607,6 +21808,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16624,6 +21831,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -16639,6 +21852,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -16654,6 +21873,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16668,6 +21893,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16682,26 +21913,32 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX1250-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -16709,6 +21946,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -16728,6 +21966,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -16748,6 +21992,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -16768,6 +22018,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16785,6 +22041,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16806,6 +22068,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -16826,6 +22094,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16844,6 +22118,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16860,6 +22140,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16877,6 +22163,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16894,6 +22186,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -16909,6 +22207,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -16924,6 +22228,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16938,6 +22248,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16952,26 +22268,32 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_cmpxchg( ; GFX1250-LABEL: private_wavefront_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -16979,6 +22301,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -16998,6 +22321,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -17018,6 +22347,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -17038,6 +22373,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17055,6 +22396,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17076,6 +22423,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -17096,6 +22449,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17114,6 +22473,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17130,6 +22495,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17147,6 +22518,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17164,6 +22541,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -17179,6 +22562,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -17194,6 +22583,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17208,6 +22603,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17222,26 +22623,32 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -17249,6 +22656,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -17268,6 +22676,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -17288,6 +22702,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -17308,6 +22728,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17325,6 +22751,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17346,6 +22778,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -17366,6 +22804,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17384,6 +22828,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17400,6 +22850,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17417,6 +22873,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17434,6 +22896,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -17449,6 +22917,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -17464,6 +22938,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17478,6 +22958,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17492,26 +22978,32 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -17519,6 +23011,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -17538,6 +23031,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -17558,6 +23057,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -17578,6 +23083,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17595,6 +23106,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17616,6 +23133,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -17636,6 +23159,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17654,6 +23183,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17670,6 +23205,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17687,6 +23228,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17704,6 +23251,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -17719,6 +23272,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -17734,6 +23293,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17748,6 +23313,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17762,26 +23333,32 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -17789,6 +23366,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -17808,6 +23386,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -17828,6 +23412,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -17848,6 +23438,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17865,6 +23461,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17886,6 +23488,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -17906,6 +23514,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17924,6 +23538,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17940,6 +23560,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17957,6 +23583,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17974,6 +23606,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -17989,6 +23627,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -18004,6 +23648,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18018,6 +23668,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18032,26 +23688,32 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -18059,6 +23721,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -18078,6 +23741,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -18098,6 +23767,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -18118,6 +23793,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18135,6 +23816,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18156,6 +23843,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -18176,6 +23869,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18194,6 +23893,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18210,6 +23915,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18227,6 +23938,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18244,6 +23961,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -18259,6 +23982,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -18274,6 +24003,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18288,6 +24023,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18302,26 +24043,32 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX1250-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -18329,6 +24076,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -18348,6 +24096,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -18368,6 +24122,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -18388,6 +24148,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18405,6 +24171,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18426,6 +24198,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -18446,6 +24224,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18464,6 +24248,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18480,6 +24270,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18497,6 +24293,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18514,6 +24316,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -18529,6 +24337,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -18544,6 +24358,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18558,6 +24378,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18572,26 +24398,32 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -18599,6 +24431,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -18618,6 +24451,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -18638,6 +24477,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -18658,6 +24503,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18675,6 +24526,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18696,6 +24553,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -18716,6 +24579,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18734,6 +24603,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18750,6 +24625,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18767,6 +24648,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18784,6 +24671,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -18799,6 +24692,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -18814,6 +24713,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18828,6 +24733,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18842,26 +24753,32 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -18869,6 +24786,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -18889,6 +24807,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_ret_cmpx ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -18913,6 +24837,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_ret_cmpx ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -18935,6 +24865,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_ret_cmpx ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18954,6 +24890,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_ret_cmpx ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18978,6 +24920,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_ret_cmpx ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -19000,6 +24948,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_ret_cmpx ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19020,6 +24974,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_ret_cmpx ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19039,6 +24999,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_ret_cmpx ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19059,6 +25025,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_ret_cmpx ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19078,6 +25050,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_ret_cmpx ; GFX11-WGP-LABEL: private_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -19094,6 +25072,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_ret_cmpx ; GFX11-CU-LABEL: private_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -19110,6 +25094,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_ret_cmpx ; GFX12-WGP-LABEL: private_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19125,6 +25115,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_ret_cmpx ; GFX12-CU-LABEL: private_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19141,25 +25137,31 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_ret_cmpx ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -19191,6 +25193,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_ret_cmpxch ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -19215,6 +25223,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_ret_cmpxch ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -19237,6 +25251,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_ret_cmpxch ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19256,6 +25276,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_ret_cmpxch ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19280,6 +25306,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_ret_cmpxch ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -19302,6 +25334,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19322,6 +25360,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19341,6 +25385,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19361,6 +25411,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_ret_cmpxch ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19380,6 +25436,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_ret_cmpxch ; GFX11-WGP-LABEL: private_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -19396,6 +25458,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_ret_cmpxch ; GFX11-CU-LABEL: private_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -19412,6 +25480,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: private_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19427,6 +25501,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_ret_cmpxch ; GFX12-CU-LABEL: private_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19443,25 +25523,31 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_ret_cmpxch ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -19493,6 +25579,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_ret_cmpxch ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -19517,6 +25609,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_ret_cmpxch ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -19539,6 +25637,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_ret_cmpxch ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19558,6 +25662,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_ret_cmpxch ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19582,6 +25692,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_ret_cmpxch ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -19604,6 +25720,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19624,6 +25746,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19643,6 +25771,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19663,6 +25797,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_ret_cmpxch ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19682,6 +25822,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_ret_cmpxch ; GFX11-WGP-LABEL: private_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -19698,6 +25844,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_ret_cmpxch ; GFX11-CU-LABEL: private_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -19714,6 +25866,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: private_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19729,6 +25887,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_ret_cmpxch ; GFX12-CU-LABEL: private_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19745,25 +25909,31 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_ret_cmpxch ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -19795,6 +25965,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_ret_cmpxch ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -19819,6 +25995,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_ret_cmpxch ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -19841,6 +26023,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_ret_cmpxch ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19860,6 +26048,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_ret_cmpxch ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19884,6 +26078,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_ret_cmpxch ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -19906,6 +26106,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19926,6 +26132,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19945,6 +26157,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19965,6 +26183,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_ret_cmpxch ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19984,6 +26208,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_ret_cmpxch ; GFX11-WGP-LABEL: private_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -20000,6 +26230,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_ret_cmpxch ; GFX11-CU-LABEL: private_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -20016,6 +26252,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: private_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20031,6 +26273,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_ret_cmpxch ; GFX12-CU-LABEL: private_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20047,25 +26295,31 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_ret_cmpxch ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -20097,6 +26351,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_ret_cmpxch ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -20121,6 +26381,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_ret_cmpxch ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -20143,6 +26409,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_ret_cmpxch ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20162,6 +26434,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_ret_cmpxch ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20186,6 +26464,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_ret_cmpxch ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -20208,6 +26492,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20228,6 +26518,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20247,6 +26543,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20267,6 +26569,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_ret_cmpxch ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20286,6 +26594,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_ret_cmpxch ; GFX11-WGP-LABEL: private_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -20302,6 +26616,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_ret_cmpxch ; GFX11-CU-LABEL: private_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -20318,6 +26638,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: private_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20333,6 +26659,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_ret_cmpxch ; GFX12-CU-LABEL: private_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20349,25 +26681,31 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_ret_cmpxch ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -20399,6 +26737,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_ret_cmpxch ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -20423,6 +26767,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_ret_cmpxch ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -20445,6 +26795,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_ret_cmpxch ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20464,6 +26820,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_ret_cmpxch ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20488,6 +26850,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_ret_cmpxch ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -20510,6 +26878,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20530,6 +26904,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20549,6 +26929,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_ret_cmpxch ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20569,6 +26955,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_ret_cmpxch ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20588,6 +26980,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_ret_cmpxch ; GFX11-WGP-LABEL: private_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -20604,6 +27002,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_ret_cmpxch ; GFX11-CU-LABEL: private_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -20620,6 +27024,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_ret_cmpxch ; GFX12-WGP-LABEL: private_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20635,6 +27045,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_ret_cmpxch ; GFX12-CU-LABEL: private_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20651,25 +27067,31 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_ret_cmpxch ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -20701,6 +27123,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -20725,6 +27153,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -20747,6 +27181,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20766,6 +27206,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20790,6 +27236,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -20812,6 +27264,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20832,6 +27290,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20851,6 +27315,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20871,6 +27341,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20890,6 +27366,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -20906,6 +27388,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -20922,6 +27410,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20937,6 +27431,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20953,25 +27453,31 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -21003,6 +27509,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -21027,6 +27539,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -21049,6 +27567,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21068,6 +27592,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21092,6 +27622,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -21114,6 +27650,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21134,6 +27676,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21153,6 +27701,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21173,6 +27727,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21192,6 +27752,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -21208,6 +27774,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -21224,6 +27796,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21239,6 +27817,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21255,25 +27839,31 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -21305,6 +27895,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -21329,6 +27925,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -21351,6 +27953,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21370,6 +27978,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21394,6 +28008,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -21416,6 +28036,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21436,6 +28062,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21455,6 +28087,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21475,6 +28113,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21494,6 +28138,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -21510,6 +28160,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -21526,6 +28182,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21541,6 +28203,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21557,25 +28225,31 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -21607,6 +28281,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -21631,6 +28311,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -21653,6 +28339,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21672,6 +28364,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21696,6 +28394,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -21718,6 +28422,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21738,6 +28448,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21757,6 +28473,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21777,6 +28499,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21796,6 +28524,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -21812,6 +28546,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -21828,6 +28568,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21843,6 +28589,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21859,25 +28611,31 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -21909,6 +28667,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_ret_cmpxch ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -21933,6 +28697,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_ret_cmpxch ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -21955,6 +28725,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_ret_cmpxch ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21974,6 +28750,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_ret_cmpxch ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21998,6 +28780,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_ret_cmpxch ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -22020,6 +28808,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22040,6 +28834,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22059,6 +28859,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_ret_cmpxch ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22079,6 +28885,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_ret_cmpxch ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22098,6 +28910,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_ret_cmpxch ; GFX11-WGP-LABEL: private_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -22114,6 +28932,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_ret_cmpxch ; GFX11-CU-LABEL: private_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -22130,6 +28954,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_ret_cmpxch ; GFX12-WGP-LABEL: private_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22145,6 +28975,12 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_ret_cmpxch ; GFX12-CU-LABEL: private_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22161,25 +28997,31 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_ret_cmpxch ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -22211,6 +29053,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -22235,6 +29083,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -22257,6 +29111,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22276,6 +29136,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22300,6 +29166,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -22322,6 +29194,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22342,6 +29220,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22361,6 +29245,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22381,6 +29271,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22400,6 +29296,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -22416,6 +29318,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -22432,6 +29340,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22447,6 +29361,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22463,25 +29383,31 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -22513,6 +29439,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -22537,6 +29469,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -22559,6 +29497,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22578,6 +29522,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22602,6 +29552,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -22624,6 +29580,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22644,6 +29606,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22663,6 +29631,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22683,6 +29657,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22702,6 +29682,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -22718,6 +29704,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -22734,6 +29726,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22749,6 +29747,12 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22765,25 +29769,31 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -22815,6 +29825,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -22839,6 +29855,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -22861,6 +29883,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22880,6 +29908,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22904,6 +29938,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -22926,6 +29966,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22946,6 +29992,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22965,6 +30017,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22985,6 +30043,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -23004,6 +30068,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -23020,6 +30090,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -23036,6 +30112,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -23051,6 +30133,12 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -23067,25 +30155,31 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -23117,6 +30211,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -23141,6 +30241,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -23163,6 +30269,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -23182,6 +30294,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -23206,6 +30324,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -23228,6 +30352,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23248,6 +30378,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23267,6 +30403,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -23287,6 +30429,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -23306,6 +30454,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -23322,6 +30476,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -23338,6 +30498,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -23353,6 +30519,12 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -23369,25 +30541,31 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll index c3b2d44cfae2..d50c8a875a20 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll @@ -19,11 +19,15 @@ define amdgpu_kernel void @private_workgroup_unordered_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -33,11 +37,15 @@ define amdgpu_kernel void @private_workgroup_unordered_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -47,11 +55,15 @@ define amdgpu_kernel void @private_workgroup_unordered_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -61,11 +73,15 @@ define amdgpu_kernel void @private_workgroup_unordered_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -79,11 +95,15 @@ define amdgpu_kernel void @private_workgroup_unordered_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -93,11 +113,15 @@ define amdgpu_kernel void @private_workgroup_unordered_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -107,11 +131,15 @@ define amdgpu_kernel void @private_workgroup_unordered_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -119,84 +147,108 @@ define amdgpu_kernel void @private_workgroup_unordered_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_workgroup_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_workgroup_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_workgroup_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_workgroup_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_workgroup_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_workgroup_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -205,6 +257,7 @@ define amdgpu_kernel void @private_workgroup_unordered_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -219,11 +272,15 @@ define amdgpu_kernel void @private_workgroup_monotonic_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -233,11 +290,15 @@ define amdgpu_kernel void @private_workgroup_monotonic_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -247,11 +308,15 @@ define amdgpu_kernel void @private_workgroup_monotonic_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -261,11 +326,15 @@ define amdgpu_kernel void @private_workgroup_monotonic_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -279,11 +348,15 @@ define amdgpu_kernel void @private_workgroup_monotonic_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -293,11 +366,15 @@ define amdgpu_kernel void @private_workgroup_monotonic_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -307,11 +384,15 @@ define amdgpu_kernel void @private_workgroup_monotonic_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -319,84 +400,108 @@ define amdgpu_kernel void @private_workgroup_monotonic_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_workgroup_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_workgroup_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_workgroup_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_workgroup_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_workgroup_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_workgroup_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -405,6 +510,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -419,11 +525,15 @@ define amdgpu_kernel void @private_workgroup_acquire_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -433,11 +543,15 @@ define amdgpu_kernel void @private_workgroup_acquire_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -447,11 +561,15 @@ define amdgpu_kernel void @private_workgroup_acquire_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -461,11 +579,15 @@ define amdgpu_kernel void @private_workgroup_acquire_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -479,11 +601,15 @@ define amdgpu_kernel void @private_workgroup_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -493,11 +619,15 @@ define amdgpu_kernel void @private_workgroup_acquire_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -507,11 +637,15 @@ define amdgpu_kernel void @private_workgroup_acquire_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -519,84 +653,108 @@ define amdgpu_kernel void @private_workgroup_acquire_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_workgroup_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_workgroup_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_workgroup_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_workgroup_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_workgroup_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_workgroup_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -605,6 +763,7 @@ define amdgpu_kernel void @private_workgroup_acquire_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -619,11 +778,15 @@ define amdgpu_kernel void @private_workgroup_seq_cst_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -633,11 +796,15 @@ define amdgpu_kernel void @private_workgroup_seq_cst_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -647,11 +814,15 @@ define amdgpu_kernel void @private_workgroup_seq_cst_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -661,11 +832,15 @@ define amdgpu_kernel void @private_workgroup_seq_cst_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -679,11 +854,15 @@ define amdgpu_kernel void @private_workgroup_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -693,11 +872,15 @@ define amdgpu_kernel void @private_workgroup_seq_cst_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -707,11 +890,15 @@ define amdgpu_kernel void @private_workgroup_seq_cst_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -719,84 +906,108 @@ define amdgpu_kernel void @private_workgroup_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_workgroup_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_workgroup_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_workgroup_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_workgroup_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_workgroup_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_workgroup_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -807,6 +1018,7 @@ define amdgpu_kernel void @private_workgroup_seq_cst_load( ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -821,10 +1033,14 @@ define amdgpu_kernel void @private_workgroup_unordered_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -833,10 +1049,14 @@ define amdgpu_kernel void @private_workgroup_unordered_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -845,10 +1065,14 @@ define amdgpu_kernel void @private_workgroup_unordered_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -857,10 +1081,14 @@ define amdgpu_kernel void @private_workgroup_unordered_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -873,10 +1101,14 @@ define amdgpu_kernel void @private_workgroup_unordered_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -885,10 +1117,14 @@ define amdgpu_kernel void @private_workgroup_unordered_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -897,65 +1133,93 @@ define amdgpu_kernel void @private_workgroup_unordered_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_workgroup_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_workgroup_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_workgroup_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_workgroup_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_workgroup_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -963,22 +1227,26 @@ define amdgpu_kernel void @private_workgroup_unordered_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -1000,10 +1268,14 @@ define amdgpu_kernel void @private_workgroup_monotonic_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -1012,10 +1284,14 @@ define amdgpu_kernel void @private_workgroup_monotonic_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -1024,10 +1300,14 @@ define amdgpu_kernel void @private_workgroup_monotonic_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -1036,10 +1316,14 @@ define amdgpu_kernel void @private_workgroup_monotonic_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -1052,10 +1336,14 @@ define amdgpu_kernel void @private_workgroup_monotonic_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1064,10 +1352,14 @@ define amdgpu_kernel void @private_workgroup_monotonic_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1076,65 +1368,93 @@ define amdgpu_kernel void @private_workgroup_monotonic_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_workgroup_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_workgroup_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_workgroup_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_workgroup_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_workgroup_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -1142,22 +1462,26 @@ define amdgpu_kernel void @private_workgroup_monotonic_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -1179,10 +1503,14 @@ define amdgpu_kernel void @private_workgroup_release_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -1191,10 +1519,14 @@ define amdgpu_kernel void @private_workgroup_release_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -1203,10 +1535,14 @@ define amdgpu_kernel void @private_workgroup_release_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -1215,10 +1551,14 @@ define amdgpu_kernel void @private_workgroup_release_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -1231,10 +1571,14 @@ define amdgpu_kernel void @private_workgroup_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1243,10 +1587,14 @@ define amdgpu_kernel void @private_workgroup_release_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1255,65 +1603,93 @@ define amdgpu_kernel void @private_workgroup_release_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_workgroup_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_workgroup_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_workgroup_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_workgroup_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_workgroup_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -1321,22 +1697,26 @@ define amdgpu_kernel void @private_workgroup_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -1360,10 +1740,14 @@ define amdgpu_kernel void @private_workgroup_seq_cst_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -1372,10 +1756,14 @@ define amdgpu_kernel void @private_workgroup_seq_cst_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -1384,10 +1772,14 @@ define amdgpu_kernel void @private_workgroup_seq_cst_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -1396,10 +1788,14 @@ define amdgpu_kernel void @private_workgroup_seq_cst_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -1412,10 +1808,14 @@ define amdgpu_kernel void @private_workgroup_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1424,10 +1824,14 @@ define amdgpu_kernel void @private_workgroup_seq_cst_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1436,65 +1840,93 @@ define amdgpu_kernel void @private_workgroup_seq_cst_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_workgroup_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_workgroup_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_workgroup_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_workgroup_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_workgroup_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -1502,22 +1934,26 @@ define amdgpu_kernel void @private_workgroup_seq_cst_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -1542,8 +1978,15 @@ define amdgpu_kernel void @private_workgroup_monotonic_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1554,8 +1997,15 @@ define amdgpu_kernel void @private_workgroup_monotonic_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1566,8 +2016,15 @@ define amdgpu_kernel void @private_workgroup_monotonic_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1578,8 +2035,15 @@ define amdgpu_kernel void @private_workgroup_monotonic_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1594,8 +2058,15 @@ define amdgpu_kernel void @private_workgroup_monotonic_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -1606,8 +2077,15 @@ define amdgpu_kernel void @private_workgroup_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1618,8 +2096,15 @@ define amdgpu_kernel void @private_workgroup_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1628,8 +2113,14 @@ define amdgpu_kernel void @private_workgroup_monotonic_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -1637,8 +2128,14 @@ define amdgpu_kernel void @private_workgroup_monotonic_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_workgroup_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -1646,8 +2143,14 @@ define amdgpu_kernel void @private_workgroup_monotonic_atomicrmw( ; GFX11-WGP-LABEL: private_workgroup_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -1655,8 +2158,14 @@ define amdgpu_kernel void @private_workgroup_monotonic_atomicrmw( ; GFX11-CU-LABEL: private_workgroup_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -1664,8 +2173,14 @@ define amdgpu_kernel void @private_workgroup_monotonic_atomicrmw( ; GFX12-WGP-LABEL: private_workgroup_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -1673,8 +2188,14 @@ define amdgpu_kernel void @private_workgroup_monotonic_atomicrmw( ; GFX12-CU-LABEL: private_workgroup_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -1682,29 +2203,34 @@ define amdgpu_kernel void @private_workgroup_monotonic_atomicrmw( ; GFX1250-LABEL: private_workgroup_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -1721,8 +2247,15 @@ define amdgpu_kernel void @private_workgroup_acquire_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1733,8 +2266,15 @@ define amdgpu_kernel void @private_workgroup_acquire_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1745,8 +2285,15 @@ define amdgpu_kernel void @private_workgroup_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1757,8 +2304,15 @@ define amdgpu_kernel void @private_workgroup_acquire_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1773,8 +2327,15 @@ define amdgpu_kernel void @private_workgroup_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -1785,8 +2346,15 @@ define amdgpu_kernel void @private_workgroup_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1797,8 +2365,15 @@ define amdgpu_kernel void @private_workgroup_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1807,8 +2382,14 @@ define amdgpu_kernel void @private_workgroup_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -1816,8 +2397,14 @@ define amdgpu_kernel void @private_workgroup_acquire_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_workgroup_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -1825,8 +2412,14 @@ define amdgpu_kernel void @private_workgroup_acquire_atomicrmw( ; GFX11-WGP-LABEL: private_workgroup_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -1834,8 +2427,14 @@ define amdgpu_kernel void @private_workgroup_acquire_atomicrmw( ; GFX11-CU-LABEL: private_workgroup_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -1843,8 +2442,14 @@ define amdgpu_kernel void @private_workgroup_acquire_atomicrmw( ; GFX12-WGP-LABEL: private_workgroup_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -1852,8 +2457,14 @@ define amdgpu_kernel void @private_workgroup_acquire_atomicrmw( ; GFX12-CU-LABEL: private_workgroup_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -1861,29 +2472,34 @@ define amdgpu_kernel void @private_workgroup_acquire_atomicrmw( ; GFX1250-LABEL: private_workgroup_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -1901,8 +2517,15 @@ define amdgpu_kernel void @private_workgroup_release_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1913,8 +2536,15 @@ define amdgpu_kernel void @private_workgroup_release_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1925,8 +2555,15 @@ define amdgpu_kernel void @private_workgroup_release_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1937,8 +2574,15 @@ define amdgpu_kernel void @private_workgroup_release_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1953,8 +2597,15 @@ define amdgpu_kernel void @private_workgroup_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -1965,8 +2616,15 @@ define amdgpu_kernel void @private_workgroup_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1977,8 +2635,15 @@ define amdgpu_kernel void @private_workgroup_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -1987,8 +2652,14 @@ define amdgpu_kernel void @private_workgroup_release_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -1996,8 +2667,14 @@ define amdgpu_kernel void @private_workgroup_release_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_workgroup_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -2005,8 +2682,14 @@ define amdgpu_kernel void @private_workgroup_release_atomicrmw( ; GFX11-WGP-LABEL: private_workgroup_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -2014,8 +2697,14 @@ define amdgpu_kernel void @private_workgroup_release_atomicrmw( ; GFX11-CU-LABEL: private_workgroup_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -2023,8 +2712,14 @@ define amdgpu_kernel void @private_workgroup_release_atomicrmw( ; GFX12-WGP-LABEL: private_workgroup_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -2032,8 +2727,14 @@ define amdgpu_kernel void @private_workgroup_release_atomicrmw( ; GFX12-CU-LABEL: private_workgroup_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -2041,29 +2742,34 @@ define amdgpu_kernel void @private_workgroup_release_atomicrmw( ; GFX1250-LABEL: private_workgroup_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 @@ -2082,8 +2788,15 @@ define amdgpu_kernel void @private_workgroup_acq_rel_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2094,8 +2807,15 @@ define amdgpu_kernel void @private_workgroup_acq_rel_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2106,8 +2826,15 @@ define amdgpu_kernel void @private_workgroup_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2118,8 +2845,15 @@ define amdgpu_kernel void @private_workgroup_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2134,8 +2868,15 @@ define amdgpu_kernel void @private_workgroup_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -2146,8 +2887,15 @@ define amdgpu_kernel void @private_workgroup_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2158,8 +2906,15 @@ define amdgpu_kernel void @private_workgroup_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2168,8 +2923,14 @@ define amdgpu_kernel void @private_workgroup_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2177,8 +2938,14 @@ define amdgpu_kernel void @private_workgroup_acq_rel_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_workgroup_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -2186,8 +2953,14 @@ define amdgpu_kernel void @private_workgroup_acq_rel_atomicrmw( ; GFX11-WGP-LABEL: private_workgroup_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -2195,8 +2968,14 @@ define amdgpu_kernel void @private_workgroup_acq_rel_atomicrmw( ; GFX11-CU-LABEL: private_workgroup_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -2204,8 +2983,14 @@ define amdgpu_kernel void @private_workgroup_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: private_workgroup_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -2213,8 +2998,14 @@ define amdgpu_kernel void @private_workgroup_acq_rel_atomicrmw( ; GFX12-CU-LABEL: private_workgroup_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -2222,29 +3013,34 @@ define amdgpu_kernel void @private_workgroup_acq_rel_atomicrmw( ; GFX1250-LABEL: private_workgroup_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 @@ -2264,8 +3060,15 @@ define amdgpu_kernel void @private_workgroup_seq_cst_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2276,8 +3079,15 @@ define amdgpu_kernel void @private_workgroup_seq_cst_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2288,8 +3098,15 @@ define amdgpu_kernel void @private_workgroup_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2300,8 +3117,15 @@ define amdgpu_kernel void @private_workgroup_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2316,8 +3140,15 @@ define amdgpu_kernel void @private_workgroup_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -2328,8 +3159,15 @@ define amdgpu_kernel void @private_workgroup_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2340,8 +3178,15 @@ define amdgpu_kernel void @private_workgroup_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -2350,8 +3195,14 @@ define amdgpu_kernel void @private_workgroup_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2359,8 +3210,14 @@ define amdgpu_kernel void @private_workgroup_seq_cst_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_workgroup_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -2368,8 +3225,14 @@ define amdgpu_kernel void @private_workgroup_seq_cst_atomicrmw( ; GFX11-WGP-LABEL: private_workgroup_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -2377,8 +3240,14 @@ define amdgpu_kernel void @private_workgroup_seq_cst_atomicrmw( ; GFX11-CU-LABEL: private_workgroup_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -2386,8 +3255,14 @@ define amdgpu_kernel void @private_workgroup_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: private_workgroup_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -2395,8 +3270,14 @@ define amdgpu_kernel void @private_workgroup_seq_cst_atomicrmw( ; GFX12-CU-LABEL: private_workgroup_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -2404,29 +3285,34 @@ define amdgpu_kernel void @private_workgroup_seq_cst_atomicrmw( ; GFX1250-LABEL: private_workgroup_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 @@ -2446,6 +3332,10 @@ define amdgpu_kernel void @private_workgroup_acquire_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2464,6 +3354,10 @@ define amdgpu_kernel void @private_workgroup_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2481,6 +3375,10 @@ define amdgpu_kernel void @private_workgroup_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2498,6 +3396,10 @@ define amdgpu_kernel void @private_workgroup_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2519,6 +3421,10 @@ define amdgpu_kernel void @private_workgroup_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2536,6 +3442,10 @@ define amdgpu_kernel void @private_workgroup_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2553,6 +3463,10 @@ define amdgpu_kernel void @private_workgroup_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2568,6 +3482,10 @@ define amdgpu_kernel void @private_workgroup_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -2580,6 +3498,10 @@ define amdgpu_kernel void @private_workgroup_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_workgroup_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -2592,6 +3514,10 @@ define amdgpu_kernel void @private_workgroup_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: private_workgroup_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -2604,6 +3530,10 @@ define amdgpu_kernel void @private_workgroup_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: private_workgroup_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -2616,6 +3546,10 @@ define amdgpu_kernel void @private_workgroup_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: private_workgroup_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -2628,6 +3562,10 @@ define amdgpu_kernel void @private_workgroup_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: private_workgroup_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -2641,22 +3579,26 @@ define amdgpu_kernel void @private_workgroup_acquire_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -2682,6 +3624,10 @@ define amdgpu_kernel void @private_workgroup_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2700,6 +3646,10 @@ define amdgpu_kernel void @private_workgroup_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2717,6 +3667,10 @@ define amdgpu_kernel void @private_workgroup_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2734,6 +3688,10 @@ define amdgpu_kernel void @private_workgroup_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2755,6 +3713,10 @@ define amdgpu_kernel void @private_workgroup_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2772,6 +3734,10 @@ define amdgpu_kernel void @private_workgroup_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2789,6 +3755,10 @@ define amdgpu_kernel void @private_workgroup_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -2804,6 +3774,10 @@ define amdgpu_kernel void @private_workgroup_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -2816,6 +3790,10 @@ define amdgpu_kernel void @private_workgroup_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_workgroup_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -2828,6 +3806,10 @@ define amdgpu_kernel void @private_workgroup_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: private_workgroup_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -2840,6 +3822,10 @@ define amdgpu_kernel void @private_workgroup_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: private_workgroup_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -2852,6 +3838,10 @@ define amdgpu_kernel void @private_workgroup_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: private_workgroup_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -2864,6 +3854,10 @@ define amdgpu_kernel void @private_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: private_workgroup_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -2877,22 +3871,26 @@ define amdgpu_kernel void @private_workgroup_acq_rel_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -2920,6 +3918,10 @@ define amdgpu_kernel void @private_workgroup_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2938,6 +3940,10 @@ define amdgpu_kernel void @private_workgroup_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2955,6 +3961,10 @@ define amdgpu_kernel void @private_workgroup_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -2972,6 +3982,10 @@ define amdgpu_kernel void @private_workgroup_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2993,6 +4007,10 @@ define amdgpu_kernel void @private_workgroup_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -3010,6 +4028,10 @@ define amdgpu_kernel void @private_workgroup_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -3027,6 +4049,10 @@ define amdgpu_kernel void @private_workgroup_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -3042,6 +4068,10 @@ define amdgpu_kernel void @private_workgroup_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -3054,6 +4084,10 @@ define amdgpu_kernel void @private_workgroup_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_workgroup_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -3066,6 +4100,10 @@ define amdgpu_kernel void @private_workgroup_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: private_workgroup_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -3078,6 +4116,10 @@ define amdgpu_kernel void @private_workgroup_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: private_workgroup_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -3090,6 +4132,10 @@ define amdgpu_kernel void @private_workgroup_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: private_workgroup_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -3102,6 +4148,10 @@ define amdgpu_kernel void @private_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: private_workgroup_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -3115,22 +4165,26 @@ define amdgpu_kernel void @private_workgroup_seq_cst_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -3158,6 +4212,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -3178,6 +4238,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -3198,6 +4264,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3215,6 +4287,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3236,6 +4314,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -3256,6 +4340,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3274,6 +4364,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3290,6 +4386,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3307,6 +4409,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3324,6 +4432,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -3339,6 +4453,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -3354,6 +4474,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3368,6 +4494,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3382,26 +4514,32 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: private_workgroup_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -3409,6 +4547,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -3428,6 +4567,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -3448,6 +4593,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -3468,6 +4619,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3485,6 +4642,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3506,6 +4669,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -3526,6 +4695,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3544,6 +4719,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3560,6 +4741,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3577,6 +4764,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3594,6 +4787,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -3609,6 +4808,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -3624,6 +4829,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3638,6 +4849,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3652,26 +4869,32 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: private_workgroup_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -3679,6 +4902,7 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -3699,6 +4923,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -3719,6 +4949,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -3739,6 +4975,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3756,6 +4998,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3777,6 +5025,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -3797,6 +5051,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3815,6 +5075,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3831,6 +5097,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3848,6 +5120,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -3865,6 +5143,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -3880,6 +5164,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -3895,6 +5185,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -3909,6 +5205,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3923,26 +5225,32 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_cmpxchg( ; GFX1250-LABEL: private_workgroup_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -3950,6 +5258,7 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -3971,6 +5280,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -3991,6 +5306,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -4011,6 +5332,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4028,6 +5355,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4049,6 +5382,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -4069,6 +5408,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4087,6 +5432,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4103,6 +5454,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4120,6 +5477,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4137,6 +5500,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -4152,6 +5521,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -4167,6 +5542,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4181,6 +5562,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4195,26 +5582,32 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -4222,6 +5615,7 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -4244,6 +5638,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -4264,6 +5664,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -4284,6 +5690,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4301,6 +5713,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4322,6 +5740,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -4342,6 +5766,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4360,6 +5790,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4376,6 +5812,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4393,6 +5835,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4410,6 +5858,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -4425,6 +5879,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -4440,6 +5900,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4454,6 +5920,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4468,26 +5940,32 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -4495,6 +5973,7 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -4517,6 +5996,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -4537,6 +6022,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -4557,6 +6048,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4574,6 +6071,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4595,6 +6098,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -4615,6 +6124,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4633,6 +6148,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4649,6 +6170,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4666,6 +6193,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4683,6 +6216,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -4698,6 +6237,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -4713,6 +6258,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4727,6 +6278,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -4741,26 +6298,32 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: private_workgroup_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -4768,6 +6331,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -4788,6 +6352,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -4808,6 +6378,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -4828,6 +6404,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4845,6 +6427,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4866,6 +6454,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -4886,6 +6480,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4904,6 +6504,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4920,6 +6526,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4937,6 +6549,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -4954,6 +6572,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -4969,6 +6593,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -4984,6 +6614,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -4998,6 +6634,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5012,26 +6654,32 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_cmpxchg( ; GFX1250-LABEL: private_workgroup_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -5039,6 +6687,7 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -5059,6 +6708,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -5079,6 +6734,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -5099,6 +6760,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5116,6 +6783,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5137,6 +6810,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -5157,6 +6836,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5175,6 +6860,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5191,6 +6882,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5208,6 +6905,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5225,6 +6928,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -5240,6 +6949,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -5255,6 +6970,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5269,6 +6990,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5283,26 +7010,32 @@ define amdgpu_kernel void @private_workgroup_release_acquire_cmpxchg( ; GFX1250-LABEL: private_workgroup_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -5310,6 +7043,7 @@ define amdgpu_kernel void @private_workgroup_release_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -5332,6 +7066,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -5352,6 +7092,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -5372,6 +7118,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5389,6 +7141,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5410,6 +7168,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -5430,6 +7194,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5448,6 +7218,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5464,6 +7240,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5481,6 +7263,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5498,6 +7286,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -5513,6 +7307,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -5528,6 +7328,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5542,6 +7348,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5556,26 +7368,32 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: private_workgroup_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -5583,6 +7401,7 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -5605,6 +7424,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -5625,6 +7450,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -5645,6 +7476,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5662,6 +7499,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5683,6 +7526,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -5703,6 +7552,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5721,6 +7576,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5737,6 +7598,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5754,6 +7621,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -5771,6 +7644,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -5786,6 +7665,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -5801,6 +7686,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -5815,6 +7706,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -5829,26 +7726,32 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: private_workgroup_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -5856,6 +7759,7 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -5878,6 +7782,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -5898,6 +7808,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -5918,6 +7834,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5935,6 +7857,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5956,6 +7884,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -5976,6 +7910,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5994,6 +7934,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6010,6 +7956,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6027,6 +7979,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6044,6 +8002,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -6059,6 +8023,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -6074,6 +8044,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6088,6 +8064,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6102,26 +8084,32 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -6129,6 +8117,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -6151,6 +8140,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -6171,6 +8166,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -6191,6 +8192,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6208,6 +8215,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6229,6 +8242,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -6249,6 +8268,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6267,6 +8292,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6283,6 +8314,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6300,6 +8337,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6317,6 +8360,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -6332,6 +8381,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -6347,6 +8402,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6361,6 +8422,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6375,26 +8442,32 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: private_workgroup_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -6402,6 +8475,7 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -6424,6 +8498,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -6444,6 +8524,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -6464,6 +8550,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6481,6 +8573,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6502,6 +8600,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -6522,6 +8626,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6540,6 +8650,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6556,6 +8672,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6573,6 +8695,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6590,6 +8718,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -6605,6 +8739,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -6620,6 +8760,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6634,6 +8780,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6648,26 +8800,32 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_cmpxchg( ; GFX1250-LABEL: private_workgroup_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -6675,6 +8833,7 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -6697,6 +8856,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -6717,6 +8882,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -6737,6 +8908,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -6754,6 +8931,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6775,6 +8958,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -6795,6 +8984,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6813,6 +9008,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6829,6 +9030,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6846,6 +9053,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -6863,6 +9076,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -6878,6 +9097,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -6893,6 +9118,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -6907,6 +9138,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -6921,26 +9158,32 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -6948,6 +9191,7 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -6970,6 +9214,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -6990,6 +9240,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -7010,6 +9266,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7027,6 +9289,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7048,6 +9316,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -7068,6 +9342,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7086,6 +9366,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7102,6 +9388,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -7119,6 +9411,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -7136,6 +9434,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -7151,6 +9455,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -7166,6 +9476,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7180,6 +9496,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7194,26 +9516,32 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -7221,6 +9549,7 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -7244,6 +9573,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -7268,6 +9603,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -7290,6 +9631,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7309,6 +9656,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7333,6 +9686,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -7355,6 +9714,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7375,6 +9740,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7394,6 +9765,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7414,6 +9791,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7433,6 +9816,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -7449,6 +9838,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -7465,6 +9860,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7480,6 +9881,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7496,25 +9903,31 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -7546,6 +9959,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -7570,6 +9989,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -7592,6 +10017,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7611,6 +10042,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7635,6 +10072,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -7657,6 +10100,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7677,6 +10126,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7696,6 +10151,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7716,6 +10177,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -7735,6 +10202,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -7751,6 +10224,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -7767,6 +10246,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -7782,6 +10267,12 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -7798,25 +10289,31 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -7848,6 +10345,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -7872,6 +10375,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -7894,6 +10403,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -7913,6 +10428,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -7937,6 +10458,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -7959,6 +10486,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7979,6 +10512,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7998,6 +10537,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8018,6 +10563,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8037,6 +10588,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -8053,6 +10610,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -8069,6 +10632,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8084,6 +10653,12 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8100,25 +10675,31 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -8152,6 +10733,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -8176,6 +10763,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -8198,6 +10791,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8217,6 +10816,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8241,6 +10846,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -8263,6 +10874,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8283,6 +10900,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8302,6 +10925,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8322,6 +10951,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8341,6 +10976,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -8357,6 +10998,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -8373,6 +11020,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8388,6 +11041,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8404,25 +11063,31 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -8456,6 +11121,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -8480,6 +11151,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -8502,6 +11179,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8521,6 +11204,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8545,6 +11234,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -8567,6 +11262,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8587,6 +11288,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8606,6 +11313,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8626,6 +11339,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8645,6 +11364,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -8661,6 +11386,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -8677,6 +11408,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8692,6 +11429,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -8708,25 +11451,31 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -8760,6 +11509,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -8784,6 +11539,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -8806,6 +11567,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -8825,6 +11592,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -8849,6 +11622,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -8871,6 +11650,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8891,6 +11676,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8910,6 +11701,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8930,6 +11727,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -8949,6 +11752,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -8965,6 +11774,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -8981,6 +11796,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -8996,6 +11817,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9012,25 +11839,31 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -9062,6 +11895,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -9086,6 +11925,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -9108,6 +11953,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9127,6 +11978,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9151,6 +12008,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -9173,6 +12036,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9193,6 +12062,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9212,6 +12087,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9232,6 +12113,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9251,6 +12138,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -9267,6 +12160,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -9283,6 +12182,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9298,6 +12203,12 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9314,25 +12225,31 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -9364,6 +12281,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -9388,6 +12311,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -9410,6 +12339,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9429,6 +12364,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9453,6 +12394,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -9475,6 +12422,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9495,6 +12448,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9514,6 +12473,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9534,6 +12499,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9553,6 +12524,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -9569,6 +12546,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -9585,6 +12568,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9600,6 +12589,12 @@ define amdgpu_kernel void @private_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9616,25 +12611,31 @@ define amdgpu_kernel void @private_workgroup_release_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -9668,6 +12669,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -9692,6 +12699,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -9714,6 +12727,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9733,6 +12752,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9757,6 +12782,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -9779,6 +12810,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9799,6 +12836,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9818,6 +12861,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9838,6 +12887,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -9857,6 +12912,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -9873,6 +12934,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -9889,6 +12956,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -9904,6 +12977,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -9920,25 +12999,31 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -9972,6 +13057,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -9996,6 +13087,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -10018,6 +13115,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10037,6 +13140,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10061,6 +13170,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -10083,6 +13198,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10103,6 +13224,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10122,6 +13249,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10142,6 +13275,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10161,6 +13300,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -10177,6 +13322,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -10193,6 +13344,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10208,6 +13365,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10224,25 +13387,31 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -10276,6 +13445,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -10300,6 +13475,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -10322,6 +13503,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10341,6 +13528,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10365,6 +13558,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -10387,6 +13586,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10407,6 +13612,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10426,6 +13637,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10446,6 +13663,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10465,6 +13688,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -10481,6 +13710,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -10497,6 +13732,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10512,6 +13753,12 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10528,25 +13775,31 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -10580,6 +13833,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -10604,6 +13863,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -10626,6 +13891,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10645,6 +13916,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10669,6 +13946,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -10691,6 +13974,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10711,6 +14000,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10730,6 +14025,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10750,6 +14051,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -10769,6 +14076,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -10785,6 +14098,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -10801,6 +14120,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10816,6 +14141,12 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10832,25 +14163,31 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -10884,6 +14221,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -10908,6 +14251,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -10930,6 +14279,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10949,6 +14304,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10973,6 +14334,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -10995,6 +14362,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11015,6 +14388,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11034,6 +14413,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11054,6 +14439,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11073,6 +14464,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -11089,6 +14486,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -11105,6 +14508,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11120,6 +14529,12 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11136,25 +14551,31 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -11188,6 +14609,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -11212,6 +14639,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -11234,6 +14667,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11253,6 +14692,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11277,6 +14722,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -11299,6 +14750,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11319,6 +14776,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11338,6 +14801,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11358,6 +14827,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11377,6 +14852,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -11393,6 +14874,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -11409,6 +14896,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11424,6 +14917,12 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11440,25 +14939,31 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -11492,6 +14997,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -11516,6 +15027,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -11538,6 +15055,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11557,6 +15080,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11581,6 +15110,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -11603,6 +15138,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11623,6 +15164,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11642,6 +15189,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11662,6 +15215,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -11681,6 +15240,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -11697,6 +15262,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -11713,6 +15284,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11728,6 +15305,12 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11744,25 +15327,31 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -11794,11 +15383,15 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11808,11 +15401,15 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11822,11 +15419,15 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11836,11 +15437,15 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11854,11 +15459,15 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -11868,11 +15477,15 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11882,11 +15495,15 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -11894,84 +15511,108 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_unordered_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_unordered_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_workgroup_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_workgroup_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_workgroup_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_workgroup_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_workgroup_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -11980,6 +15621,7 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -11994,11 +15636,15 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12008,11 +15654,15 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12022,11 +15672,15 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12036,11 +15690,15 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12054,11 +15712,15 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -12068,11 +15730,15 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12082,11 +15748,15 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12094,84 +15764,108 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_monotonic_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_monotonic_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_workgroup_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_workgroup_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_workgroup_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_workgroup_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_workgroup_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12180,6 +15874,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -12194,11 +15889,15 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12208,11 +15907,15 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12222,11 +15925,15 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12236,11 +15943,15 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12254,11 +15965,15 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -12268,11 +15983,15 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12282,11 +16001,15 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12294,84 +16017,108 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acquire_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acquire_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_workgroup_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_workgroup_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_workgroup_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_workgroup_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_workgroup_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12380,6 +16127,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_load( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -12394,11 +16142,15 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_load( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12408,11 +16160,15 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_load( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12422,11 +16178,15 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_load( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12436,11 +16196,15 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_load( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12454,11 +16218,15 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -12468,11 +16236,15 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12482,11 +16254,15 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_load( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -12494,84 +16270,108 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_load( ; ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_load: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_load: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 -; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_workgroup_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_workgroup_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_workgroup_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_workgroup_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12582,6 +16382,7 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_load( ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -12596,10 +16397,14 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -12608,10 +16413,14 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -12620,10 +16429,14 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -12632,10 +16445,14 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -12648,10 +16465,14 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -12660,10 +16481,14 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -12672,65 +16497,93 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_unordered_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_unordered_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_workgroup_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_workgroup_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_workgroup_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_workgroup_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -12738,22 +16591,26 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12775,10 +16632,14 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -12787,10 +16648,14 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -12799,10 +16664,14 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -12811,10 +16680,14 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -12827,10 +16700,14 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -12839,10 +16716,14 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -12851,65 +16732,93 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_monotonic_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_monotonic_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_workgroup_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_workgroup_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_workgroup_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_workgroup_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -12917,22 +16826,26 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -12954,10 +16867,14 @@ define amdgpu_kernel void @private_workgroup_one_as_release_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -12966,10 +16883,14 @@ define amdgpu_kernel void @private_workgroup_one_as_release_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -12978,10 +16899,14 @@ define amdgpu_kernel void @private_workgroup_one_as_release_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -12990,10 +16915,14 @@ define amdgpu_kernel void @private_workgroup_one_as_release_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -13006,10 +16935,14 @@ define amdgpu_kernel void @private_workgroup_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -13018,10 +16951,14 @@ define amdgpu_kernel void @private_workgroup_one_as_release_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -13030,65 +16967,93 @@ define amdgpu_kernel void @private_workgroup_one_as_release_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_release_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_release_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_workgroup_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_workgroup_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_workgroup_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_workgroup_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -13096,22 +17061,26 @@ define amdgpu_kernel void @private_workgroup_one_as_release_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -13135,10 +17104,14 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_store( ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-NEXT: s_endpgm @@ -13147,10 +17120,14 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_store( ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm @@ -13159,10 +17136,14 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_store( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-WGP-NEXT: s_endpgm @@ -13171,10 +17152,14 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_store( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-CU-NEXT: s_endpgm @@ -13187,10 +17172,14 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -13199,10 +17188,14 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -13211,65 +17204,93 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_store( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_store: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_store: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: private_workgroup_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_workgroup_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: private_workgroup_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_workgroup_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm ; @@ -13277,22 +17298,26 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_store( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 @@ -13317,8 +17342,15 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13329,8 +17361,15 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13341,8 +17380,15 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13353,8 +17399,15 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13369,8 +17422,15 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -13381,8 +17441,15 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13393,8 +17460,15 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13403,8 +17477,14 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_monotonic_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -13412,8 +17492,14 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_monotonic_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -13421,8 +17507,14 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_atomicrmw( ; GFX11-WGP-LABEL: private_workgroup_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -13430,8 +17522,14 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_atomicrmw( ; GFX11-CU-LABEL: private_workgroup_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -13439,8 +17537,14 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_atomicrmw( ; GFX12-WGP-LABEL: private_workgroup_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -13448,8 +17552,14 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_atomicrmw( ; GFX12-CU-LABEL: private_workgroup_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -13457,29 +17567,34 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_atomicrmw( ; GFX1250-LABEL: private_workgroup_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -13496,8 +17611,15 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13508,8 +17630,15 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13520,8 +17649,15 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13532,8 +17668,15 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13548,8 +17691,15 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -13560,8 +17710,15 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13572,8 +17729,15 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13582,8 +17746,14 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acquire_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -13591,8 +17761,14 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acquire_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -13600,8 +17776,14 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_atomicrmw( ; GFX11-WGP-LABEL: private_workgroup_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -13609,8 +17791,14 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_atomicrmw( ; GFX11-CU-LABEL: private_workgroup_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -13618,8 +17806,14 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_atomicrmw( ; GFX12-WGP-LABEL: private_workgroup_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -13627,8 +17821,14 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_atomicrmw( ; GFX12-CU-LABEL: private_workgroup_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -13636,29 +17836,34 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_atomicrmw( ; GFX1250-LABEL: private_workgroup_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -13676,8 +17881,15 @@ define amdgpu_kernel void @private_workgroup_one_as_release_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13688,8 +17900,15 @@ define amdgpu_kernel void @private_workgroup_one_as_release_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13700,8 +17919,15 @@ define amdgpu_kernel void @private_workgroup_one_as_release_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13712,8 +17938,15 @@ define amdgpu_kernel void @private_workgroup_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13728,8 +17961,15 @@ define amdgpu_kernel void @private_workgroup_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -13740,8 +17980,15 @@ define amdgpu_kernel void @private_workgroup_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13752,8 +17999,15 @@ define amdgpu_kernel void @private_workgroup_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13762,8 +18016,14 @@ define amdgpu_kernel void @private_workgroup_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_release_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -13771,8 +18031,14 @@ define amdgpu_kernel void @private_workgroup_one_as_release_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_release_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -13780,8 +18046,14 @@ define amdgpu_kernel void @private_workgroup_one_as_release_atomicrmw( ; GFX11-WGP-LABEL: private_workgroup_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -13789,8 +18061,14 @@ define amdgpu_kernel void @private_workgroup_one_as_release_atomicrmw( ; GFX11-CU-LABEL: private_workgroup_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -13798,8 +18076,14 @@ define amdgpu_kernel void @private_workgroup_one_as_release_atomicrmw( ; GFX12-WGP-LABEL: private_workgroup_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -13807,8 +18091,14 @@ define amdgpu_kernel void @private_workgroup_one_as_release_atomicrmw( ; GFX12-CU-LABEL: private_workgroup_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -13816,29 +18106,34 @@ define amdgpu_kernel void @private_workgroup_one_as_release_atomicrmw( ; GFX1250-LABEL: private_workgroup_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -13857,8 +18152,15 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13869,8 +18171,15 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13881,8 +18190,15 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13893,8 +18209,15 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13909,8 +18232,15 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -13921,8 +18251,15 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13933,8 +18270,15 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -13943,8 +18287,14 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acq_rel_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -13952,8 +18302,14 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acq_rel_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -13961,8 +18317,14 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_atomicrmw( ; GFX11-WGP-LABEL: private_workgroup_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -13970,8 +18332,14 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_atomicrmw( ; GFX11-CU-LABEL: private_workgroup_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -13979,8 +18347,14 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: private_workgroup_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -13988,8 +18362,14 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-CU-LABEL: private_workgroup_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -13997,29 +18377,34 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_atomicrmw( ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -14039,8 +18424,15 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14051,8 +18443,15 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14063,8 +18462,15 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14075,8 +18481,15 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14091,8 +18504,15 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen @@ -14103,8 +18523,15 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14115,8 +18542,15 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -14125,8 +18559,14 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -14134,8 +18574,14 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 ; GFX942-TGSPLIT-NEXT: s_endpgm @@ -14143,8 +18589,14 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_atomicrmw( ; GFX11-WGP-LABEL: private_workgroup_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-WGP-NEXT: s_endpgm @@ -14152,8 +18604,14 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_atomicrmw( ; GFX11-CU-LABEL: private_workgroup_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-CU-NEXT: s_endpgm @@ -14161,8 +18619,14 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: private_workgroup_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-WGP-NEXT: s_endpgm @@ -14170,8 +18634,14 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-CU-LABEL: private_workgroup_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 ; GFX12-CU-NEXT: s_endpgm @@ -14179,29 +18649,34 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_atomicrmw( ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 -; GFX1250-NEXT: s_mov_b32 s1, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s1, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s1, s4 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -14221,6 +18696,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14239,6 +18718,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14256,6 +18739,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -14273,6 +18760,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -14294,6 +18785,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14311,6 +18806,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14328,6 +18827,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14343,6 +18846,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acquire_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14355,6 +18862,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acquire_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14367,6 +18878,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: private_workgroup_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14379,6 +18894,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: private_workgroup_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14391,6 +18910,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: private_workgroup_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14403,6 +18926,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: private_workgroup_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14416,22 +18943,26 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -14457,6 +18988,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14475,6 +19010,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14492,6 +19031,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -14509,6 +19052,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -14530,6 +19077,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14547,6 +19098,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14564,6 +19119,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14579,6 +19138,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14591,6 +19154,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14603,6 +19170,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: private_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14615,6 +19186,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: private_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14627,6 +19202,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: private_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14639,6 +19218,10 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: private_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14652,22 +19235,26 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -14695,6 +19282,10 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14713,6 +19304,10 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14730,6 +19325,10 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 @@ -14747,6 +19346,10 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -14768,6 +19371,10 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14785,6 +19392,10 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14802,6 +19413,10 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 @@ -14817,6 +19432,10 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14829,6 +19448,10 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 @@ -14841,6 +19464,10 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: private_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14853,6 +19480,10 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: private_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14865,6 +19496,10 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: private_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 @@ -14877,6 +19512,10 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: private_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 @@ -14890,22 +19529,26 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s2 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s3, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -14933,6 +19576,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -14953,6 +19602,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -14973,6 +19628,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14990,6 +19651,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15011,6 +19678,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -15031,6 +19704,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15049,6 +19728,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15065,6 +19750,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15082,6 +19773,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15099,6 +19796,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -15114,6 +19817,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -15129,6 +19838,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15143,6 +19858,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15157,26 +19878,32 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -15184,6 +19911,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -15203,6 +19931,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -15223,6 +19957,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -15243,6 +19983,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15260,6 +20006,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15281,6 +20033,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -15301,6 +20059,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15319,6 +20083,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15335,6 +20105,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15352,6 +20128,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15369,6 +20151,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -15384,6 +20172,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -15399,6 +20193,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15413,6 +20213,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15427,26 +20233,32 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX1250-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -15454,6 +20266,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -15474,6 +20287,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -15494,6 +20313,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -15514,6 +20339,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15531,6 +20362,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15552,6 +20389,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -15572,6 +20415,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15590,6 +20439,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15606,6 +20461,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15623,6 +20484,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15640,6 +20507,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -15655,6 +20528,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -15670,6 +20549,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15684,6 +20569,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15698,26 +20589,32 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_cmpxchg( ; GFX1250-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -15725,6 +20622,7 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -15746,6 +20644,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -15766,6 +20670,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -15786,6 +20696,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -15803,6 +20719,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -15824,6 +20746,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -15844,6 +20772,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15862,6 +20796,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -15878,6 +20818,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15895,6 +20841,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -15912,6 +20864,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -15927,6 +20885,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -15942,6 +20906,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -15956,6 +20926,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15970,26 +20946,32 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -15997,6 +20979,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -16019,6 +21002,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -16039,6 +21028,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -16059,6 +21054,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16076,6 +21077,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16097,6 +21104,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -16117,6 +21130,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16135,6 +21154,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16151,6 +21176,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16168,6 +21199,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16185,6 +21222,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -16200,6 +21243,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -16215,6 +21264,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16229,6 +21284,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16243,26 +21304,32 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -16270,6 +21337,7 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -16292,6 +21360,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -16312,6 +21386,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -16332,6 +21412,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16349,6 +21435,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16370,6 +21462,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -16390,6 +21488,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16408,6 +21512,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16424,6 +21534,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16441,6 +21557,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16458,6 +21580,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -16473,6 +21601,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -16488,6 +21622,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16502,6 +21642,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16516,26 +21662,32 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX1250-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -16543,6 +21695,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -16563,6 +21716,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -16583,6 +21742,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -16603,6 +21768,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16620,6 +21791,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16641,6 +21818,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -16661,6 +21844,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16679,6 +21868,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16695,6 +21890,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16712,6 +21913,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16729,6 +21936,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -16744,6 +21957,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -16759,6 +21978,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -16773,6 +21998,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -16787,26 +22018,32 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX1250-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -16814,6 +22051,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -16834,6 +22072,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -16854,6 +22098,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -16874,6 +22124,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16891,6 +22147,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16912,6 +22174,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -16932,6 +22200,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16950,6 +22224,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16966,6 +22246,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_release_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -16983,6 +22269,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_release_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17000,6 +22292,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -17015,6 +22313,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -17030,6 +22334,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17044,6 +22354,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17058,26 +22374,32 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_cmpxchg( ; GFX1250-LABEL: private_workgroup_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -17085,6 +22407,7 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -17107,6 +22430,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -17127,6 +22456,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -17147,6 +22482,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17164,6 +22505,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17185,6 +22532,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -17205,6 +22558,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17223,6 +22582,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17239,6 +22604,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17256,6 +22627,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17273,6 +22650,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -17288,6 +22671,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -17303,6 +22692,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17317,6 +22712,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17331,26 +22732,32 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -17358,6 +22765,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -17380,6 +22788,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -17400,6 +22814,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -17420,6 +22840,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17437,6 +22863,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17458,6 +22890,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -17478,6 +22916,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17496,6 +22940,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17512,6 +22962,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17529,6 +22985,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17546,6 +23008,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -17561,6 +23029,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -17576,6 +23050,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17590,6 +23070,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17604,26 +23090,32 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -17631,6 +23123,7 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -17653,6 +23146,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -17673,6 +23172,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -17693,6 +23198,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17710,6 +23221,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -17731,6 +23248,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -17751,6 +23274,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17769,6 +23298,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -17785,6 +23320,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17802,6 +23343,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -17819,6 +23366,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -17834,6 +23387,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -17849,6 +23408,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -17863,6 +23428,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -17877,26 +23448,32 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -17904,6 +23481,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -17926,6 +23504,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -17946,6 +23530,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -17966,6 +23556,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -17983,6 +23579,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18004,6 +23606,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -18024,6 +23632,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18042,6 +23656,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18058,6 +23678,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18075,6 +23701,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18092,6 +23724,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -18107,6 +23745,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -18122,6 +23766,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18136,6 +23786,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18150,26 +23806,32 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -18177,6 +23839,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -18199,6 +23862,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -18219,6 +23888,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -18239,6 +23914,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18256,6 +23937,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18277,6 +23964,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -18297,6 +23990,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18315,6 +24014,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18331,6 +24036,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18348,6 +24059,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18365,6 +24082,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -18380,6 +24103,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -18395,6 +24124,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18409,6 +24144,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18423,26 +24164,32 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX1250-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -18450,6 +24197,7 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -18472,6 +24220,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -18492,6 +24246,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -18512,6 +24272,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18529,6 +24295,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18550,6 +24322,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -18570,6 +24348,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18588,6 +24372,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18604,6 +24394,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18621,6 +24417,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18638,6 +24440,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -18653,6 +24461,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -18668,6 +24482,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18682,6 +24502,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18696,26 +24522,32 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -18723,6 +24555,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -18745,6 +24578,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 s7, 16 @@ -18765,6 +24604,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 16 @@ -18785,6 +24630,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -18802,6 +24653,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -18823,6 +24680,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 @@ -18843,6 +24706,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18861,6 +24730,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -18877,6 +24752,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18894,6 +24775,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 @@ -18911,6 +24798,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s3, 16 @@ -18926,6 +24819,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s3, 16 @@ -18941,6 +24840,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -18955,6 +24860,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -18969,26 +24880,32 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0 ; GFX1250-NEXT: s_mov_b32 s2, s5 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s2, s4 @@ -18996,6 +24913,7 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 @@ -19019,6 +24937,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_ret_cmpx ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -19043,6 +24967,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_ret_cmpx ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -19065,6 +24995,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_ret_cmpx ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19084,6 +25020,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_ret_cmpx ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19108,6 +25050,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_ret_cmpx ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -19130,6 +25078,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_ret_cmpx ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19150,6 +25104,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_ret_cmpx ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19169,6 +25129,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_ret_cmpx ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19189,6 +25155,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_ret_cmpx ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19208,6 +25180,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_ret_cmpx ; GFX11-WGP-LABEL: private_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -19224,6 +25202,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_ret_cmpx ; GFX11-CU-LABEL: private_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -19240,6 +25224,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_ret_cmpx ; GFX12-WGP-LABEL: private_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19255,6 +25245,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_ret_cmpx ; GFX12-CU-LABEL: private_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19271,25 +25267,31 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_ret_cmpx ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -19321,6 +25323,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_ret_cmpxch ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -19345,6 +25353,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_ret_cmpxch ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -19367,6 +25381,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_ret_cmpxch ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19386,6 +25406,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_ret_cmpxch ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19410,6 +25436,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_ret_cmpxch ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -19432,6 +25464,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19452,6 +25490,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19471,6 +25515,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19491,6 +25541,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_ret_cmpxch ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19510,6 +25566,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_ret_cmpxch ; GFX11-WGP-LABEL: private_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -19526,6 +25588,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_ret_cmpxch ; GFX11-CU-LABEL: private_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -19542,6 +25610,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: private_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19557,6 +25631,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_ret_cmpxch ; GFX12-CU-LABEL: private_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19573,25 +25653,31 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_ret_cmpxch ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -19623,6 +25709,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_ret_cmpxch ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -19647,6 +25739,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_ret_cmpxch ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -19669,6 +25767,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_ret_cmpxch ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19688,6 +25792,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_ret_cmpxch ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -19712,6 +25822,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_ret_cmpxch ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -19734,6 +25850,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19754,6 +25876,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -19773,6 +25901,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19793,6 +25927,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_ret_cmpxch ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -19812,6 +25952,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_ret_cmpxch ; GFX11-WGP-LABEL: private_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -19828,6 +25974,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_ret_cmpxch ; GFX11-CU-LABEL: private_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -19844,6 +25996,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: private_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -19859,6 +26017,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_ret_cmpxch ; GFX12-CU-LABEL: private_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -19875,25 +26039,31 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_ret_cmpxch ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -19927,6 +26097,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_ret_cmpxch ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -19951,6 +26127,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_ret_cmpxch ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -19973,6 +26155,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_ret_cmpxch ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -19992,6 +26180,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_ret_cmpxch ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20016,6 +26210,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_ret_cmpxch ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -20038,6 +26238,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20058,6 +26264,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20077,6 +26289,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20097,6 +26315,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_ret_cmpxch ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20116,6 +26340,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_ret_cmpxch ; GFX11-WGP-LABEL: private_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -20132,6 +26362,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_ret_cmpxch ; GFX11-CU-LABEL: private_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -20148,6 +26384,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: private_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20163,6 +26405,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_ret_cmpxch ; GFX12-CU-LABEL: private_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20179,25 +26427,31 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_ret_cmpxch ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -20231,6 +26485,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_ret_cmpxch ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -20255,6 +26515,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_ret_cmpxch ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -20277,6 +26543,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_ret_cmpxch ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20296,6 +26568,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_ret_cmpxch ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20320,6 +26598,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_ret_cmpxch ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -20342,6 +26626,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20362,6 +26652,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20381,6 +26677,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20401,6 +26703,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_ret_cmpxch ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20420,6 +26728,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_ret_cmpxch ; GFX11-WGP-LABEL: private_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -20436,6 +26750,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_ret_cmpxch ; GFX11-CU-LABEL: private_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -20452,6 +26772,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: private_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20467,6 +26793,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_ret_cmpxch ; GFX12-CU-LABEL: private_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20483,25 +26815,31 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_ret_cmpxch ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -20535,6 +26873,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_ret_cmpxch ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -20559,6 +26903,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_ret_cmpxch ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -20581,6 +26931,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_ret_cmpxch ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20600,6 +26956,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_ret_cmpxch ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20624,6 +26986,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_ret_cmpxch ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -20646,6 +27014,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20666,6 +27040,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20685,6 +27065,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_ret_cmpxch ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20705,6 +27091,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_ret_cmpxch ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -20724,6 +27116,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_ret_cmpxch ; GFX11-WGP-LABEL: private_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -20740,6 +27138,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_ret_cmpxch ; GFX11-CU-LABEL: private_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -20756,6 +27160,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_ret_cmpxch ; GFX12-WGP-LABEL: private_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -20771,6 +27181,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_ret_cmpxch ; GFX12-CU-LABEL: private_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -20787,25 +27203,31 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_ret_cmpxch ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -20837,6 +27259,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -20861,6 +27289,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -20883,6 +27317,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -20902,6 +27342,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -20926,6 +27372,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -20948,6 +27400,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20968,6 +27426,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -20987,6 +27451,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21007,6 +27477,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21026,6 +27502,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -21042,6 +27524,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -21058,6 +27546,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21073,6 +27567,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21089,25 +27589,31 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -21139,6 +27645,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -21163,6 +27675,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -21185,6 +27703,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21204,6 +27728,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21228,6 +27758,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -21250,6 +27786,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21270,6 +27812,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21289,6 +27837,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21309,6 +27863,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21328,6 +27888,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -21344,6 +27910,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -21360,6 +27932,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21375,6 +27953,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21391,25 +27975,31 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -21443,6 +28033,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -21467,6 +28063,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -21489,6 +28091,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21508,6 +28116,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21532,6 +28146,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -21554,6 +28174,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21574,6 +28200,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21593,6 +28225,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21613,6 +28251,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21632,6 +28276,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -21648,6 +28298,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -21664,6 +28320,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21679,6 +28341,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21695,25 +28363,31 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -21747,6 +28421,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -21771,6 +28451,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -21793,6 +28479,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -21812,6 +28504,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -21836,6 +28534,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -21858,6 +28562,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21878,6 +28588,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -21897,6 +28613,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21917,6 +28639,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -21936,6 +28664,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -21952,6 +28686,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -21968,6 +28708,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -21983,6 +28729,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -21999,25 +28751,31 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -22051,6 +28809,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_ret_cmpxch ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -22075,6 +28839,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_ret_cmpxch ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -22097,6 +28867,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_ret_cmpxch ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22116,6 +28892,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_ret_cmpxch ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22140,6 +28922,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_ret_cmpxch ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -22162,6 +28950,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22182,6 +28976,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22201,6 +29001,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_ret_cmpxch ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22221,6 +29027,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_ret_cmpxch ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22240,6 +29052,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_ret_cmpxch ; GFX11-WGP-LABEL: private_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -22256,6 +29074,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_ret_cmpxch ; GFX11-CU-LABEL: private_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -22272,6 +29096,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_ret_cmpxch ; GFX12-WGP-LABEL: private_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22287,6 +29117,12 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_ret_cmpxch ; GFX12-CU-LABEL: private_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22303,25 +29139,31 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_ret_cmpxch ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -22355,6 +29197,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -22379,6 +29227,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -22401,6 +29255,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22420,6 +29280,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22444,6 +29310,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -22466,6 +29338,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22486,6 +29364,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22505,6 +29389,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22525,6 +29415,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22544,6 +29440,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -22560,6 +29462,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -22576,6 +29484,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22591,6 +29505,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22607,25 +29527,31 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -22659,6 +29585,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -22683,6 +29615,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -22705,6 +29643,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -22724,6 +29668,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -22748,6 +29698,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -22770,6 +29726,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22790,6 +29752,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -22809,6 +29777,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22829,6 +29803,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -22848,6 +29828,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -22864,6 +29850,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -22880,6 +29872,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -22895,6 +29893,12 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -22911,25 +29915,31 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -22963,6 +29973,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -22987,6 +30003,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -23009,6 +30031,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -23028,6 +30056,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -23052,6 +30086,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -23074,6 +30114,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23094,6 +30140,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23113,6 +30165,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -23133,6 +30191,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -23152,6 +30216,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -23168,6 +30238,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -23184,6 +30260,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -23199,6 +30281,12 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -23215,25 +30303,31 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 @@ -23267,6 +30361,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX6-NEXT: s_mov_b32 s5, 16 @@ -23291,6 +30391,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s5, 16 @@ -23313,6 +30419,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -23332,6 +30444,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -23356,6 +30474,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 @@ -23378,6 +30502,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23398,6 +30528,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -23417,6 +30553,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_nop 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -23437,6 +30579,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX942-TGSPLIT-NEXT: s_nop 0 ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -23456,6 +30604,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-LABEL: private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s1, 16 @@ -23472,6 +30626,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-LABEL: private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s1, 16 @@ -23488,6 +30648,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -23503,6 +30669,12 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -23519,25 +30691,31 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_mov_b32 s3, 16 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 -; GFX1250-NEXT: s_mov_b32 s3, 0 -; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 -; GFX1250-NEXT: s_mov_b32 s3, 20 -; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s0, s3 +; GFX1250-NEXT: s_mov_b32 s4, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s4 +; GFX1250-NEXT: s_mov_b32 s4, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo -; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 s4, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s4 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s3, s7 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: s_mov_b32 s3, s6 diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll index 8f64e3c5d72d..0300574b9f7d 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -17,6 +17,10 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: s_mov_b32 s95, 0xe8f000 ; GCN-NEXT: s_add_u32 s92, s92, s11 ; GCN-NEXT: s_addc_u32 s93, s93, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s0, s[4:5], 0xb +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s0, s[4:5], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] @@ -219,8 +223,12 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s0, s1 -; GCN-NEXT: s_cbranch_scc1 .LBB0_2 +; GCN-NEXT: s_cmp_eq_u32 s0, s1 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_mov_b64 s[2:3], -1 +; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_cbranch_vccnz .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload @@ -478,6 +486,10 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 % ; GCN-NEXT: s_mov_b32 s55, 0xe8f000 ; GCN-NEXT: s_add_u32 s52, s52, s11 ; GCN-NEXT: s_addc_u32 s53, s53, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s0, s[4:5], 0xb +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s0, s[4:5], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] @@ -581,8 +593,12 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 % ; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s0, s1 -; GCN-NEXT: s_cbranch_scc1 .LBB1_2 +; GCN-NEXT: s_cmp_eq_u32 s0, s1 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_mov_b64 s[2:3], -1 +; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_cbranch_vccnz .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 ; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload @@ -721,6 +737,10 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: s_mov_b32 s55, 0xe8f000 ; GCN-NEXT: s_add_u32 s52, s52, s11 ; GCN-NEXT: s_addc_u32 s53, s53, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s0, s[4:5], 0xb +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s0, s[4:5], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND @@ -825,8 +845,12 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s0, s1 -; GCN-NEXT: s_cbranch_scc1 .LBB2_2 +; GCN-NEXT: s_cmp_eq_u32 s0, s1 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_mov_b64 s[2:3], -1 +; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_cbranch_vccnz .LBB2_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GCN-NEXT: buffer_load_dword v31, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload @@ -958,6 +982,8 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: s_add_u32 s52, s52, s11 ; GCN-NEXT: s_addc_u32 s53, s53, 0 ; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART @@ -1061,8 +1087,12 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s0, s1 -; GCN-NEXT: s_cbranch_scc1 .LBB3_2 +; GCN-NEXT: s_cmp_eq_u32 s0, s1 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_mov_b64 s[2:3], -1 +; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_cbranch_vccnz .LBB3_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GCN-NEXT: buffer_load_dword v31, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index 0ead77e1a9cf..35c318e9be4f 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -236,256 +236,279 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0: ; %bb.0: ; %_udiv-special-cases ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v12 +; GFX9-O0-NEXT: v_ashrrev_i64 v[9:10], s4, v[7:8] +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10 +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_ashrrev_i64 v[14:15], s4, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10 +; GFX9-O0-NEXT: v_xor_b32_e64 v2, v2, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-O0-NEXT: v_xor_b32_e64 v12, v7, v11 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v7 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_xor_b32_e64 v2, v2, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_xor_b32_e64 v0, v0, v11 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 +; GFX9-O0-NEXT: v_xor_b32_e64 v9, v7, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-O0-NEXT: v_xor_b32_e64 v9, v9, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v13 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v10, v11, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v8, vcc +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1 ; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v3 -; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v5 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v2, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v14, vcc, v4, v5, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v1, v2, vcc +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 -; GFX9-O0-NEXT: v_ashrrev_i64 v[2:3], s4, v[2:3] -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 0 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 1 -; GFX9-O0-NEXT: s_mov_b32 s10, s6 -; GFX9-O0-NEXT: v_writelane_b32 v30, s10, 2 -; GFX9-O0-NEXT: s_mov_b32 s11, s7 -; GFX9-O0-NEXT: v_writelane_b32 v30, s11, 3 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v10, vcc, s10, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v4, v3, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v0, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v1, vcc -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v11 -; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[12:13], s[4:5] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v3 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v8, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v12, vcc, s10, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v14, vcc, v9, v10, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v9, v4, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v9, v8, vcc -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v13 -; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[17:18], s[4:5] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, v10, v14, s[4:5] -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 killed $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v10 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v18 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v18 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 -; GFX9-O0-NEXT: v_or_b32_e64 v9, v9, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 -; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v14 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[13:14], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v16 -; GFX9-O0-NEXT: v_or_b32_e64 v9, v9, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v14 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[13:14], s[6:7] -; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[11:12], s[6:7] +; GFX9-O0-NEXT: v_or_b32_e64 v0, v5, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-O0-NEXT: ; implicit-def: $vgpr31 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_writelane_b32 v31, s8, 0 +; GFX9-O0-NEXT: v_writelane_b32 v31, s9, 1 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[0:1], s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v7, v3, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v2, v0 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[6:7], v[9:10], s[8:9] +; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v4 -; GFX9-O0-NEXT: s_mov_b32 s12, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s12 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 -; GFX9-O0-NEXT: v_min_u32_e64 v8, v4, v8 +; GFX9-O0-NEXT: s_mov_b32 s6, 32 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s6 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-O0-NEXT: v_min_u32_e64 v6, v4, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s6 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-O0-NEXT: v_min_u32_e64 v16, v5, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v4 +; GFX9-O0-NEXT: s_mov_b64 s[10:11], 64 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-O0-NEXT: s_mov_b32 s12, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 +; GFX9-O0-NEXT: s_mov_b32 s7, s11 +; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[12:13], v8, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v9, s[12:13] ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 -; GFX9-O0-NEXT: v_add_u32_e64 v7, v7, s12 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v10 -; GFX9-O0-NEXT: v_min_u32_e64 v13, v7, v10 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v4 -; GFX9-O0-NEXT: s_mov_b64 s[14:15], 64 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13 -; GFX9-O0-NEXT: s_mov_b32 s16, s14 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 -; GFX9-O0-NEXT: s_mov_b32 s13, s15 -; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[16:17], v10, s16 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s13 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[16:17], v7, v11, s[16:17] -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[14:15], s[12:13] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[14:15] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v6, v7, s[12:13] ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[5:6], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v0 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s12 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v1 ; GFX9-O0-NEXT: v_min_u32_e64 v5, v5, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v2 -; GFX9-O0-NEXT: v_add_u32_e64 v10, v10, s12 +; GFX9-O0-NEXT: v_add_u32_e64 v10, v10, s6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v3 -; GFX9-O0-NEXT: v_min_u32_e64 v11, v10, v11 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v11 -; GFX9-O0-NEXT: s_mov_b32 s12, s14 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 -; GFX9-O0-NEXT: s_mov_b32 s14, s15 -; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[12:13], v10, s12 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s14 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[12:13], v4, v11, s[12:13] +; GFX9-O0-NEXT: v_min_u32_e64 v14, v10, v11 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14 +; GFX9-O0-NEXT: s_mov_b32 s6, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v15 +; GFX9-O0-NEXT: s_mov_b32 s10, s11 +; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[6:7], v10, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s10 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[6:7], v4, v11, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[12:13], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[10:11] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: s_mov_b32 s7, s8 +; GFX9-O0-NEXT: s_mov_b32 s14, s9 ; GFX9-O0-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v7 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v5, v6, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v6, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s14 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v6, vcc ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], v[7:8], s[10:11] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[16:17], s[12:13] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[16:17], v[4:5], s[16:17] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[16:17] +; GFX9-O0-NEXT: s_mov_b64 s[16:17], s[8:9] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[16:17], v[7:8], s[16:17] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[16:17] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[10:11] ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1 -; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] -; GFX9-O0-NEXT: s_mov_b64 s[14:15], -1 -; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[8:9] -; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15] +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v6, 1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] +; GFX9-O0-NEXT: s_mov_b32 s6, 1 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] +; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, 1 +; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: s_mov_b32 s14, s13 -; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 +; GFX9-O0-NEXT: s_mov_b32 s15, s13 +; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s15 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -498,23 +521,29 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[4:5], s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s14 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13] ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s14 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[10:11] ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] +; GFX9-O0-NEXT: v_and_b32_e64 v4, 1, v4 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, 1 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], -1 +; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill @@ -522,10 +551,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 4 -; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 5 +; GFX9-O0-NEXT: v_writelane_b32 v31, s4, 2 +; GFX9-O0-NEXT: v_writelane_b32 v31, s5, 3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] @@ -533,67 +562,67 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB0_8 ; GFX9-O0-NEXT: .LBB0_1: ; %Flow ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6 -; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 +; GFX9-O0-NEXT: v_readlane_b32 s4, v31, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v31, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: ; %bb.2: ; %Flow -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_5 ; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4 -; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5 +; GFX9-O0-NEXT: v_readlane_b32 s4, v31, 2 +; GFX9-O0-NEXT: v_readlane_b32 s5, v31, 3 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_9 ; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] @@ -627,66 +656,66 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB0_3 ; GFX9-O0-NEXT: .LBB0_5: ; %Flow1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 8 -; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 9 +; GFX9-O0-NEXT: v_readlane_b32 s4, v31, 6 +; GFX9-O0-NEXT: v_readlane_b32 s5, v31, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_4 ; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 10 -; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 11 -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_readlane_b32 s6, v31, 8 +; GFX9-O0-NEXT: v_readlane_b32 s7, v31, 9 +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 63 ; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3] @@ -817,72 +846,72 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v2 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v0 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v14 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v0 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v14 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7 +; GFX9-O0-NEXT: v_writelane_b32 v31, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v31, s7, 5 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 10 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 11 +; GFX9-O0-NEXT: v_writelane_b32 v31, s6, 8 +; GFX9-O0-NEXT: v_writelane_b32 v31, s7, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6 ; GFX9-O0-NEXT: s_branch .LBB0_1 ; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload @@ -912,13 +941,17 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], v5, v[14:15] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v21 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v22, s[6:7] -; GFX9-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v22, s[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v20 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: s_mov_b32 s4, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, s4 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v19 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v20, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v18 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec @@ -958,115 +991,126 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4 ; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s4 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 10 -; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 11 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_writelane_b32 v31, s4, 8 +; GFX9-O0-NEXT: v_writelane_b32 v31, s5, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_6 ; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: s_mov_b32 s4, s7 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: s_mov_b32 s8, s6 ; GFX9-O0-NEXT: s_mov_b32 s9, s7 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v3, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v2, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v0 +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f -; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[10:11], 0x7f +; GFX9-O0-NEXT: s_mov_b32 s5, s10 +; GFX9-O0-NEXT: s_mov_b32 s4, s11 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v2, vcc, s5, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v3, v4, vcc +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 -; GFX9-O0-NEXT: s_mov_b32 s4, 64 -; GFX9-O0-NEXT: v_sub_u32_e64 v13, s4, v2 -; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 -; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: s_mov_b32 s10, 64 +; GFX9-O0-NEXT: v_sub_u32_e64 v12, s10, v2 +; GFX9-O0-NEXT: v_lshrrev_b64 v[12:13], v12, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v13 +; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v14 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 ; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 -; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v2, s4 -; GFX9-O0-NEXT: s_mov_b32 s10, 63 -; GFX9-O0-NEXT: v_sub_u32_e64 v3, s10, v3 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v2, s10 +; GFX9-O0-NEXT: v_sub_u32_e64 v3, v2, s10 ; GFX9-O0-NEXT: v_lshlrev_b64 v[12:13], v3, v[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[4:5] -; GFX9-O0-NEXT: s_mov_b32 s10, 0 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v2, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[10:11] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v2, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v12, s[10:11] +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -1080,12 +1124,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v3, v6, s[4:5] ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 @@ -1099,25 +1143,25 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9 +; GFX9-O0-NEXT: v_writelane_b32 v31, s6, 6 +; GFX9-O0-NEXT: v_writelane_b32 v31, s7, 7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5 @@ -1131,206 +1175,235 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b32 s4, 32 +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 s[4:5], 0xffffffff +; GFX9-O0-NEXT: s_mov_b32 s6, s5 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) -; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[17:18] +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v16 +; GFX9-O0-NEXT: v_and_b32_e64 v2, v0, s6 +; GFX9-O0-NEXT: s_mov_b32 s5, s4 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v15 +; GFX9-O0-NEXT: v_and_b32_e64 v0, v5, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v21 -; GFX9-O0-NEXT: v_mul_lo_u32 v8, v1, v0 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9 +; GFX9-O0-NEXT: v_and_b32_e64 v0, v0, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v8 +; GFX9-O0-NEXT: v_and_b32_e64 v23, v18, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v23 +; GFX9-O0-NEXT: v_mad_u64_u32 v[23:24], s[8:9], v14, v1, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v25, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-O0-NEXT: ; kill: def $vgpr25 killed $vgpr25 def $vgpr25_vgpr26 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v26, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v24 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, s4 +; GFX9-O0-NEXT: s_mov_b32 s4, 32 +; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], s4, v[23:24] +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v24 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v25 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 killed $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v23, v17, v23 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v24 +; GFX9-O0-NEXT: v_and_b32_e64 v2, v2, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v23 +; GFX9-O0-NEXT: v_and_b32_e64 v27, v17, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr27 killed $vgpr27 def $vgpr27_vgpr28 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v28, v2 +; GFX9-O0-NEXT: v_lshrrev_b64 v[23:24], s4, v[23:24] +; GFX9-O0-NEXT: v_lshrrev_b64 v[8:9], s4, v[8:9] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mad_u64_u32 v[25:26], s[8:9], v8, v1, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v29, v25 +; GFX9-O0-NEXT: ; kill: def $vgpr29 killed $vgpr29 def $vgpr29_vgpr30 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v30, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v30 +; GFX9-O0-NEXT: v_mov_b32_e32 v25, v26 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr25 killed $vgpr25 def $vgpr25_vgpr26 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v26, s7 +; GFX9-O0-NEXT: v_lshlrev_b64 v[25:26], s4, v[25:26] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v25 +; GFX9-O0-NEXT: v_or_b32_e64 v25, v2, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr25 killed $vgpr25 def $vgpr25_vgpr26 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v26, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v24 +; GFX9-O0-NEXT: v_add_co_u32_e64 v1, s[8:9], v1, v17 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v9, s[8:9], v2, v9, s[8:9] +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 +; GFX9-O0-NEXT: v_and_b32_e64 v9, v9, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v1 +; GFX9-O0-NEXT: v_and_b32_e64 v23, v17, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v9 +; GFX9-O0-NEXT: v_lshrrev_b64 v[15:16], s4, v[15:16] +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15 +; GFX9-O0-NEXT: v_mad_u64_u32 v[25:26], s[6:7], v14, v9, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v25 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v25, v26 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; kill: def $vgpr25 killed $vgpr25 def $vgpr25_vgpr26 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v26, s5 +; GFX9-O0-NEXT: v_lshlrev_b64 v[25:26], s4, v[25:26] +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v26 +; GFX9-O0-NEXT: v_or_b32_e64 v16, v16, v17 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v25 +; GFX9-O0-NEXT: v_or_b32_e64 v14, v14, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v24 +; GFX9-O0-NEXT: v_add_co_u32_e64 v16, s[6:7], v16, v17 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v14, s[6:7], v14, v15, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14 +; GFX9-O0-NEXT: v_lshlrev_b64 v[25:26], s4, v[16:17] +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v27 +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v25 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v28 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v26 +; GFX9-O0-NEXT: v_add_co_u32_e64 v14, s[6:7], v14, v24 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v23, s[6:7], v15, v23, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v23 +; GFX9-O0-NEXT: v_lshrrev_b64 v[23:24], s4, v[16:17] +; GFX9-O0-NEXT: v_lshrrev_b64 v[1:2], s4, v[1:2] +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v24 +; GFX9-O0-NEXT: v_add_co_u32_e64 v23, s[6:7], v16, v17 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v1, s[6:7], v1, v2, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v1 +; GFX9-O0-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v8, v9, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v25, v16 +; GFX9-O0-NEXT: ; kill: def $vgpr25 killed $vgpr25 def $vgpr25_vgpr26 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v26, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v26 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 +; GFX9-O0-NEXT: v_lshlrev_b64 v[16:17], s4, v[16:17] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v17 +; GFX9-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v25 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 killed $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v25, v2, v16 +; GFX9-O0-NEXT: ; kill: def $vgpr25 killed $vgpr25 def $vgpr25_vgpr26 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v26, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v24 +; GFX9-O0-NEXT: v_add_co_u32_e64 v1, s[6:7], v1, v17 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v16, s[6:7], v2, v16, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v21 +; GFX9-O0-NEXT: v_mul_lo_u32 v16, v9, v17 ; GFX9-O0-NEXT: v_lshrrev_b64 v[21:22], s4, v[21:22] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v21 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 -; GFX9-O0-NEXT: v_mul_lo_u32 v2, v5, v2 -; GFX9-O0-NEXT: v_mad_u64_u32 v[17:18], s[6:7], v5, v0, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v18 -; GFX9-O0-NEXT: v_add3_u32 v8, v0, v2, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v21 +; GFX9-O0-NEXT: v_mul_lo_u32 v9, v5, v9 +; GFX9-O0-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v5, v17, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v22 +; GFX9-O0-NEXT: v_add3_u32 v16, v5, v9, v16 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 +; GFX9-O0-NEXT: v_lshlrev_b64 v[16:17], s4, v[16:17] +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 killed $vgpr21_vgpr22 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v22 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 killed $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v21 +; GFX9-O0-NEXT: v_or_b32_e64 v16, v9, v16 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v19 +; GFX9-O0-NEXT: v_mul_lo_u32 v8, v5, v8 +; GFX9-O0-NEXT: v_lshrrev_b64 v[19:20], s4, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v19 +; GFX9-O0-NEXT: v_mul_lo_u32 v9, v9, v18 +; GFX9-O0-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v5, v18, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v19 +; GFX9-O0-NEXT: v_add3_u32 v8, v5, v8, v9 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, s5 ; GFX9-O0-NEXT: v_lshlrev_b64 v[8:9], s4, v[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9 -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 killed $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v18 -; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 -; GFX9-O0-NEXT: v_or_b32_e64 v17, v8, v9 -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-O0-NEXT: v_lshrrev_b64 v[8:9], s4, v[19:20] -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-O0-NEXT: v_mul_lo_u32 v14, v9, v8 -; GFX9-O0-NEXT: v_lshrrev_b64 v[15:16], s4, v[15:16] -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 killed $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v19 -; GFX9-O0-NEXT: v_mul_lo_u32 v15, v15, v0 -; GFX9-O0-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v9, v0, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v20 -; GFX9-O0-NEXT: v_add3_u32 v14, v9, v14, v15 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 -; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 killed $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v20 -; GFX9-O0-NEXT: v_or_b32_e64 v9, v9, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v19 -; GFX9-O0-NEXT: v_or_b32_e64 v19, v14, v15 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v18 -; GFX9-O0-NEXT: v_add_co_u32_e64 v17, s[6:7], v15, v16 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v9, s[6:7], v9, v14, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v9 -; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v8, v1, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v14 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 -; GFX9-O0-NEXT: v_lshlrev_b64 v[15:16], s4, v[14:15] -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16 -; GFX9-O0-NEXT: v_or_b32_e64 v9, v9, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v19 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 killed $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_or_b32_e64 v21, v14, v15 -; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v9 -; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v8, v5, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v14 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 -; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v8, v8, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v19 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_or_b32_e64 v23, v9, v14 -; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v8 -; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v0, v5, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v15 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v20 -; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[6:7], v8, v16 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v5, v9, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0xffffffff -; GFX9-O0-NEXT: s_mov_b32 s5, s7 -; GFX9-O0-NEXT: v_and_b32_e64 v5, v5, s5 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v8 -; GFX9-O0-NEXT: s_mov_b32 s5, s6 -; GFX9-O0-NEXT: v_and_b32_e64 v19, v16, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 -; GFX9-O0-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v0, v1, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v25, v23 -; GFX9-O0-NEXT: ; kill: def $vgpr25 killed $vgpr25 def $vgpr25_vgpr26 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v26, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v26 -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v24 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v24, s5 -; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], s4, v[23:24] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v24 -; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v25 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 -; GFX9-O0-NEXT: v_or_b32_e64 v23, v1, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v24 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v20 -; GFX9-O0-NEXT: v_add_co_u32_e64 v0, s[6:7], v0, v16 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v1, v5, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-O0-NEXT: v_lshrrev_b64 v[19:20], s4, v[0:1] -; GFX9-O0-NEXT: v_lshrrev_b64 v[23:24], s4, v[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v20 -; GFX9-O0-NEXT: v_add_co_u32_e64 v19, s[6:7], v9, v16 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v5, v8, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v21 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v22 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v20 -; GFX9-O0-NEXT: v_add_co_u32_e64 v19, s[6:7], v9, v16 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v5, v8, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 -; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[6:7], v8, v16 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v5, v9, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 killed $vgpr18_vgpr19 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v19 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v18 +; GFX9-O0-NEXT: v_or_b32_e64 v18, v5, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 +; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[6:7], v8, v9 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v5, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-O0-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v14 -; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_add_co_u32_e64 v0, s[6:7], v0, v5 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v1, v2, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 @@ -1378,7 +1451,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] ; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -1652,11 +1725,11 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_or_b32_e64 v0, v5, v4 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 0 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 1 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[0:1], s[6:7] +; GFX9-O0-NEXT: v_writelane_b32 v30, s8, 0 +; GFX9-O0-NEXT: v_writelane_b32 v30, s9, 1 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[0:1], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 ; GFX9-O0-NEXT: v_or_b32_e64 v7, v3, v1 @@ -1665,11 +1738,11 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_or_b32_e64 v9, v2, v0 ; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], s[6:7] -; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[6:7], v[9:10], s[8:9] +; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v4 -; GFX9-O0-NEXT: s_mov_b32 s8, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s8 +; GFX9-O0-NEXT: s_mov_b32 s6, 32 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 ; GFX9-O0-NEXT: v_min_u32_e64 v6, v4, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 @@ -1677,7 +1750,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s8 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 ; GFX9-O0-NEXT: v_min_u32_e64 v16, v5, v8 ; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec @@ -1686,14 +1759,14 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v16 ; GFX9-O0-NEXT: s_mov_b32 s12, s10 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 -; GFX9-O0-NEXT: s_mov_b32 s9, s11 +; GFX9-O0-NEXT: s_mov_b32 s7, s11 ; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[12:13], v8, s12 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v9, s[12:13] ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[14:15], s[12:13] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 @@ -1702,75 +1775,87 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v0 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s8 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v1 ; GFX9-O0-NEXT: v_min_u32_e64 v5, v5, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v2 -; GFX9-O0-NEXT: v_add_u32_e64 v10, v10, s8 +; GFX9-O0-NEXT: v_add_u32_e64 v10, v10, s6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v3 ; GFX9-O0-NEXT: v_min_u32_e64 v14, v10, v11 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14 -; GFX9-O0-NEXT: s_mov_b32 s8, s10 +; GFX9-O0-NEXT: s_mov_b32 s6, s10 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v15 ; GFX9-O0-NEXT: s_mov_b32 s10, s11 -; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[8:9], v10, s8 +; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[6:7], v10, s6 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, s10 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[8:9], v4, v11, s[8:9] +; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[6:7], v4, v11, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[12:13], s[8:9] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[12:13], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: s_mov_b32 s10, s6 -; GFX9-O0-NEXT: s_mov_b32 s11, s7 +; GFX9-O0-NEXT: s_mov_b32 s7, s8 +; GFX9-O0-NEXT: s_mov_b32 s14, s9 ; GFX9-O0-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v7 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v5, v6, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v6, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s14 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v6, vcc ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], v[7:8], s[10:11] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[16:17], s[12:13] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[16:17], v[4:5], s[16:17] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[16:17] +; GFX9-O0-NEXT: s_mov_b64 s[16:17], s[8:9] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[16:17], v[7:8], s[16:17] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[16:17] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[10:11] ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1 -; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] -; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1 -; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v6, 1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] +; GFX9-O0-NEXT: s_mov_b32 s6, 1 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] +; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, 1 +; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: s_mov_b32 s14, s13 -; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 +; GFX9-O0-NEXT: s_mov_b32 s15, s13 +; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s15 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -1783,20 +1868,29 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[4:5], s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s14 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13] ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s14 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[10:11] ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] +; GFX9-O0-NEXT: v_and_b32_e64 v4, 1, v4 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, 1 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], -1 +; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill @@ -1806,17 +1900,17 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec ; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 2 ; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 3 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_3 ; GFX9-O0-NEXT: s_branch .LBB1_8 ; GFX9-O0-NEXT: .LBB1_1: ; %Flow -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5 @@ -1848,9 +1942,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_5 ; GFX9-O0-NEXT: .LBB1_3: ; %Flow2 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 2 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 3 @@ -1908,9 +2002,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_3 ; GFX9-O0-NEXT: .LBB1_5: ; %Flow1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 @@ -1939,9 +2033,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB1_4 ; GFX9-O0-NEXT: .LBB1_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 8 ; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 9 @@ -2123,9 +2217,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8 ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill @@ -2154,9 +2248,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_cbranch_execnz .LBB1_6 ; GFX9-O0-NEXT: s_branch .LBB1_1 ; GFX9-O0-NEXT: .LBB1_7: ; %udiv-preheader -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload @@ -2194,13 +2288,17 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], v5, v[14:15] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v21 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v22, s[6:7] -; GFX9-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v22, s[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v20 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: s_mov_b32 s4, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, s4 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v19 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v20, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v18 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec @@ -2248,9 +2346,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 8 ; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill @@ -2277,78 +2375,89 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_6 ; GFX9-O0-NEXT: .LBB1_8: ; %udiv-bb1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: s_mov_b32 s4, s7 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: s_mov_b32 s8, s6 ; GFX9-O0-NEXT: s_mov_b32 s9, s7 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v3, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v2, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v0 +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f -; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[10:11], 0x7f +; GFX9-O0-NEXT: s_mov_b32 s5, s10 +; GFX9-O0-NEXT: s_mov_b32 s4, s11 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v2, vcc, s5, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v3, v4, vcc +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 -; GFX9-O0-NEXT: s_mov_b32 s4, 64 -; GFX9-O0-NEXT: v_sub_u32_e64 v13, s4, v2 -; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 -; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: s_mov_b32 s10, 64 +; GFX9-O0-NEXT: v_sub_u32_e64 v12, s10, v2 +; GFX9-O0-NEXT: v_lshrrev_b64 v[12:13], v12, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v13 +; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v14 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 ; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 -; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v2, s4 -; GFX9-O0-NEXT: s_mov_b32 s10, 63 -; GFX9-O0-NEXT: v_sub_u32_e64 v3, s10, v3 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v2, s10 +; GFX9-O0-NEXT: v_sub_u32_e64 v3, v2, s10 ; GFX9-O0-NEXT: v_lshlrev_b64 v[12:13], v3, v[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[4:5] -; GFX9-O0-NEXT: s_mov_b32 s10, 0 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v2, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[10:11] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v2, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v12, s[10:11] +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -2398,9 +2507,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6 ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_5 ; GFX9-O0-NEXT: s_branch .LBB1_7 @@ -2409,206 +2518,235 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b32 s4, 32 +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 s[4:5], 0xffffffff +; GFX9-O0-NEXT: s_mov_b32 s6, s5 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) -; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v13 -; GFX9-O0-NEXT: v_mul_lo_u32 v10, v6, v2 +; GFX9-O0-NEXT: v_and_b32_e64 v5, v2, s6 +; GFX9-O0-NEXT: s_mov_b32 s5, s4 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-O0-NEXT: v_and_b32_e64 v3, v2, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], s4, v[13:14] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 -; GFX9-O0-NEXT: v_mul_lo_u32 v3, v7, v3 -; GFX9-O0-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v2, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_and_b32_e64 v6, v3, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v10 +; GFX9-O0-NEXT: v_and_b32_e64 v3, v14, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-O0-NEXT: v_mad_u64_u32 v[19:20], s[8:9], v4, v5, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v20 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, s4 +; GFX9-O0-NEXT: s_mov_b32 s4, 32 +; GFX9-O0-NEXT: v_lshlrev_b64 v[19:20], s4, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 killed $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v19, v7, v19 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v20 +; GFX9-O0-NEXT: v_and_b32_e64 v3, v3, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v19 +; GFX9-O0-NEXT: v_and_b32_e64 v23, v7, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v3 +; GFX9-O0-NEXT: v_lshrrev_b64 v[19:20], s4, v[19:20] +; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s4, v[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v3, v5, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s7 +; GFX9-O0-NEXT: v_lshlrev_b64 v[10:11], s4, v[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v21, v7, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v20 +; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[8:9], v10, v11 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[8:9], v5, v7, s[8:9] +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v11 +; GFX9-O0-NEXT: v_and_b32_e64 v5, v5, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-O0-NEXT: v_and_b32_e64 v19, v7, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[12:13], s4, v[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-O0-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v4, v7, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v21 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v22 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-O0-NEXT: v_lshlrev_b64 v[17:18], s4, v[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v22, s5 +; GFX9-O0-NEXT: v_lshlrev_b64 v[21:22], s4, v[21:22] +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v22 +; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v13 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v21 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v20 +; GFX9-O0-NEXT: v_add_co_u32_e64 v12, s[6:7], v12, v13 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[6:7], v4, v5, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 -; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 -; GFX9-O0-NEXT: v_or_b32_e64 v13, v3, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2 -; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[15:16] -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v11 -; GFX9-O0-NEXT: v_mul_lo_u32 v3, v2, v10 -; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s4, v[11:12] -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v15 -; GFX9-O0-NEXT: v_mul_lo_u32 v11, v11, v5 -; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v2, v5, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v16 -; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v11 +; GFX9-O0-NEXT: v_lshlrev_b64 v[21:22], s4, v[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v22 +; GFX9-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v4, v20 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v19, s[6:7], v5, v19, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v19 +; GFX9-O0-NEXT: v_lshrrev_b64 v[19:20], s4, v[12:13] +; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s4, v[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20 +; GFX9-O0-NEXT: v_add_co_u32_e64 v19, s[6:7], v12, v13 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v10, s[6:7], v10, v11, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v10 +; GFX9-O0-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v3, v7, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v22 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v22, s5 +; GFX9-O0-NEXT: v_lshlrev_b64 v[21:22], s4, v[21:22] +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v22 +; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v13 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 +; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v11 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20 +; GFX9-O0-NEXT: v_add_co_u32_e64 v12, s[6:7], v12, v13 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v10, s[6:7], v10, v11, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v17 +; GFX9-O0-NEXT: v_mul_lo_u32 v10, v7, v11 +; GFX9-O0-NEXT: v_lshrrev_b64 v[17:18], s4, v[17:18] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v17 +; GFX9-O0-NEXT: v_mul_lo_u32 v7, v2, v7 +; GFX9-O0-NEXT: v_mad_u64_u32 v[17:18], s[6:7], v2, v11, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v18 +; GFX9-O0-NEXT: v_add3_u32 v10, v2, v7, v10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-O0-NEXT: v_lshlrev_b64 v[10:11], s4, v[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 killed $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v18 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v17 +; GFX9-O0-NEXT: v_or_b32_e64 v10, v7, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 +; GFX9-O0-NEXT: v_mul_lo_u32 v3, v2, v3 +; GFX9-O0-NEXT: v_lshrrev_b64 v[15:16], s4, v[15:16] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 +; GFX9-O0-NEXT: v_mul_lo_u32 v7, v7, v14 +; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v2, v14, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 +; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 killed $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v16 -; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v14 -; GFX9-O0-NEXT: v_add_co_u32_e64 v13, s[6:7], v11, v12 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v3, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2 -; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v10, v6, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5 -; GFX9-O0-NEXT: v_lshlrev_b64 v[15:16], s4, v[15:16] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v16 -; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v12 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 -; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v10, v7, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5 -; GFX9-O0-NEXT: v_lshlrev_b64 v[15:16], s4, v[15:16] -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v16 -; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v17 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v19, v11, v12 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v10 -; GFX9-O0-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v5, v7, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v11 -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v18 -; GFX9-O0-NEXT: v_add_co_u32_e64 v15, s[6:7], v15, v16 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v7, v12, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v16 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0xffffffff -; GFX9-O0-NEXT: s_mov_b32 s5, s7 -; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, s5 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 -; GFX9-O0-NEXT: s_mov_b32 s5, s6 -; GFX9-O0-NEXT: v_and_b32_e64 v17, v12, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v7 -; GFX9-O0-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v5, v6, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v19 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v20 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, s5 -; GFX9-O0-NEXT: v_lshlrev_b64 v[19:20], s4, v[19:20] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v20 -; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v12 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v15 ; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v18 -; GFX9-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v5, v12 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v6, v7, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-O0-NEXT: v_lshrrev_b64 v[17:18], s4, v[5:6] -; GFX9-O0-NEXT: v_lshrrev_b64 v[19:20], s4, v[15:16] -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v18 -; GFX9-O0-NEXT: v_add_co_u32_e64 v15, s[6:7], v15, v16 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v7, v12, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16 -; GFX9-O0-NEXT: v_add_co_u32_e64 v15, s[6:7], v7, v12 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v3, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 -; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[6:7], v2, v12 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v3, v7, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7 -; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], s4, v[5:6] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[6:7], v6, v7 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v3, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[6:7], v2, v7 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v6, s[6:7], v3, v6, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v8 @@ -2672,43 +2810,56 @@ define i128 @v_srem_i128_v_pow2k(i128 %lhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-O0-NEXT: v_ashrrev_i64 v[5:6], s4, v[5:6] -; GFX9-O0-NEXT: s_mov_b32 s4, 31 -; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s4, v[5:6] +; GFX9-O0-NEXT: s_mov_b32 s5, 31 +; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s5, v[5:6] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: s_mov_b32 s5, s6 -; GFX9-O0-NEXT: s_mov_b32 s4, s7 -; GFX9-O0-NEXT: v_add_co_u32_e32 v6, vcc, v5, v4 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v0, v2, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v3, v4, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v4, vcc +; GFX9-O0-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-O0-NEXT: s_mov_b32 s6, s8 +; GFX9-O0-NEXT: s_mov_b32 s4, s9 +; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v5, v4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v4, vcc, v0, v2, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v6, vcc, v3, v2, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v1, v2, vcc ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7 -; GFX9-O0-NEXT: s_mov_b32 s6, -2 -; GFX9-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 -; GFX9-O0-NEXT: s_mov_b32 s5, s6 -; GFX9-O0-NEXT: s_mov_b32 s6, s5 -; GFX9-O0-NEXT: v_and_b32_e64 v2, v2, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-O0-NEXT: v_lshlrev_b64 v[11:12], s5, v[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-O0-NEXT: s_mov_b32 s4, 33 +; GFX9-O0-NEXT: v_lshrrev_b64 v[9:10], s4, v[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v8, v4, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 +; GFX9-O0-NEXT: v_ashrrev_i64 v[10:11], s4, v[6:7] +; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s5, v[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-O0-NEXT: v_lshlrev_b64 v[10:11], s4, v[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v11 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v4, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: v_lshlrev_b64 v[8:9], s4, v[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 -; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 -; GFX9-O0-NEXT: v_and_b32_e64 v10, v4, s4 -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 ; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v7 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v6, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc @@ -2741,27 +2892,28 @@ define i128 @v_urem_i128_v_pow2k(i128 %lhs) { ; GFX9-O0-LABEL: v_urem_i128_v_pow2k: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr1 killed $exec -; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr4 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: s_mov_b32 s6, 1 ; GFX9-O0-NEXT: s_mov_b32 s4, -1 ; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_mov_b32 s6, s5 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: v_and_b32_e64 v3, v2, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-O0-NEXT: v_and_b32_e64 v0, v0, s6 ; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_and_b32_e64 v1, v0, s4 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-O0-NEXT: s_mov_b32 s4, 32 ; GFX9-O0-NEXT: v_lshrrev_b64 v[1:2], s4, v[1:2] ; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index 47ebd072c4cc..5aa2cc713a4e 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -7,15 +7,23 @@ define amdgpu_kernel void @respect_optnone(double %arg0, double %arg1, ptr addrs ; CHECK-LABEL: respect_optnone: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; CHECK-NEXT: s_mov_b32 s6, 0x3ff ; CHECK-NEXT: v_and_b32_e64 v0, v0, s6 +; CHECK-NEXT: v_ashrrev_i32_e64 v1, 31, v0 ; CHECK-NEXT: s_mov_b32 s6, 3 -; CHECK-NEXT: v_lshlrev_b32_e64 v0, s6, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, s[4:5] +; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; CHECK-NEXT: v_lshl_add_u64 v[0:1], v[0:1], s6, v[2:3] +; CHECK-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll index 808e60f655c4..9df94328bb6f 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll @@ -1,9 +1,9 @@ -; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GCN,OPT %s ; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; GCN-LABEL: {{^}}scalar_to_vector_i16: -; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 42 -; GCN: buffer_store_short [[V]], +; OPT: v_mov_b32_e32 [[V:v[0-9]+]], 42 +; OPT: buffer_store_short [[V]], define void @scalar_to_vector_i16() { %tmp = load <2 x i16>, ptr addrspace(5) poison %tmp1 = insertelement <2 x i16> %tmp, i16 42, i64 0 @@ -12,8 +12,8 @@ define void @scalar_to_vector_i16() { } ; GCN-LABEL: {{^}}scalar_to_vector_f16: -; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x3c00 -; GCN: buffer_store_short [[V]], +; OPT: v_mov_b32_e32 [[V:v[0-9]+]], 0x3c00 +; OPT: buffer_store_short [[V]], define void @scalar_to_vector_f16() { %tmp = load <2 x half>, ptr addrspace(5) poison %tmp1 = insertelement <2 x half> %tmp, half 1.0, i64 0 diff --git a/llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll b/llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll index 563e95f7f55b..a3f1bf8e5843 100644 --- a/llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll @@ -21,6 +21,7 @@ define void @phi_vec1half_to_f32_with_const_folding(ptr addrspace(1) %dst) #0 { ; CHECK-NEXT: s_mov_b32 s9, s10 ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: buffer_store_short v2, v[0:1], s[8:11], 0 addr64 offset:2 +; CHECK-NEXT: buffer_store_short v0, v[0:1], s[8:11], 0 addr64 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -53,6 +54,7 @@ define void @phi_vec1half_to_f32(ptr addrspace(1) %src, ptr addrspace(1) %dst) # ; CHECK-NEXT: s_mov_b32 s5, s6 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 offset:2 +; CHECK-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -76,8 +78,10 @@ define void @phi_vec1bf16_to_f32(ptr addrspace(1) %src, ptr addrspace(1) %dst) # ; CHECK-NEXT: s_mov_b32 s4, s6 ; CHECK-NEXT: s_mov_b32 s5, s6 ; CHECK-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; CHECK-NEXT: s_mov_b32 s4, 16 +; CHECK-NEXT: s_mov_b32 s4, 0xffff ; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_and_b32_e64 v0, v0, s4 +; CHECK-NEXT: s_mov_b32 s4, 16 ; CHECK-NEXT: v_lshlrev_b32_e64 v0, s4, v0 ; CHECK-NEXT: ; %bb.1: ; %bb ; CHECK-NEXT: v_mul_f32_e64 v0, 1.0, v0 @@ -88,6 +92,7 @@ define void @phi_vec1bf16_to_f32(ptr addrspace(1) %src, ptr addrspace(1) %dst) # ; CHECK-NEXT: s_mov_b32 s4, s6 ; CHECK-NEXT: s_mov_b32 s5, s6 ; CHECK-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 offset:2 +; CHECK-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -116,6 +121,7 @@ define void @phi_vec1bf16_to_f32_with_const_folding(ptr addrspace(1) %dst) #0 { ; CHECK-NEXT: s_mov_b32 s4, s6 ; CHECK-NEXT: s_mov_b32 s5, s6 ; CHECK-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 offset:2 +; CHECK-NEXT: buffer_store_short v0, v[0:1], s[4:7], 0 addr64 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll index fcf2aa448e42..cd36efa19162 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll @@ -9,6 +9,10 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou ; GCN: ; %bb.0: ; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s4, s[8:9], 0x2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s4, s[8:9], 0x2 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND @@ -109,8 +113,12 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou ; GCN-NEXT: s_mov_b64 exec, s[24:25] ; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s4, s5 -; GCN-NEXT: s_cbranch_scc1 .LBB0_2 +; GCN-NEXT: s_cmp_eq_u32 s4, s5 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_mov_b64 s[6:7], -1 +; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] +; GCN-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN-NEXT: s_cbranch_vccnz .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 ; GCN-NEXT: buffer_load_dword v23, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll index 702953c56a5c..979788c9231d 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -726,16 +726,20 @@ define void @spill_sgpr_with_sgpr_uses() #0 { ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: ; implicit-def: $vgpr254 : SGPR spill to VGPR lane ; GCN-NEXT: v_writelane_b32 v254, s5, 0 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: s_or_saveexec_b64 s[10:11], -1 ; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: s_mov_b64 exec, s[10:11] ; GCN-NEXT: s_mov_b32 s5, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, s5 -; GCN-NEXT: s_cbranch_scc1 .LBB3_2 +; GCN-NEXT: s_cmp_eq_u32 s4, s5 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_mov_b64 s[6:7], -1 +; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] +; GCN-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN-NEXT: s_cbranch_vccnz .LBB3_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: s_or_saveexec_b64 s[10:11], -1 ; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: s_mov_b64 exec, s[10:11] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s4, v254, 0 ; GCN-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/smfmac_alloc_failure_no_agpr_O0.ll b/llvm/test/CodeGen/AMDGPU/smfmac_alloc_failure_no_agpr_O0.ll index ba0fdc689b4f..6ec5dbc16baf 100644 --- a/llvm/test/CodeGen/AMDGPU/smfmac_alloc_failure_no_agpr_O0.ll +++ b/llvm/test/CodeGen/AMDGPU/smfmac_alloc_failure_no_agpr_O0.ll @@ -7,43 +7,86 @@ declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half>, <16 x half define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %arg, <8 x half> %a, <16 x half> %b, i32 %idx) #0 { ; CHECK-LABEL: test_smfmac_f32_32x32x32_f16__vgpr: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_mov_b64 s[2:3], s[4:5] ; CHECK-NEXT: v_mov_b32_e32 v1, v0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: s_load_dword s2, s[2:3], 0x64 -; CHECK-NEXT: s_mov_b32 s3, 0x3ff -; CHECK-NEXT: v_and_b32_e64 v1, v1, s3 -; CHECK-NEXT: s_mov_b32 s3, 6 -; CHECK-NEXT: v_lshlrev_b32_e64 v8, s3, v1 +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x64 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:48 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; CHECK-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x34 +; CHECK-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; CHECK-NEXT: s_load_dword s2, s[4:5], 0x64 +; CHECK-NEXT: s_mov_b32 s3, 0x3ff +; CHECK-NEXT: v_and_b32_e64 v2, v1, s3 +; CHECK-NEXT: v_ashrrev_i32_e64 v1, 31, v2 +; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; CHECK-NEXT: v_mov_b32_e32 v3, v1 +; CHECK-NEXT: s_mov_b32 s3, 6 +; CHECK-NEXT: v_lshlrev_b64 v[2:3], s3, v[2:3] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_lshl_add_u64 v[2:3], s[0:1], 0, v[2:3] +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v1, v7 -; CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mov_b32_e32 v3, v5 +; CHECK-NEXT: v_mov_b32_e32 v12, v6 +; CHECK-NEXT: v_mov_b32_e32 v13, v5 +; CHECK-NEXT: v_mov_b32_e32 v14, v4 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v15, v7 +; CHECK-NEXT: v_mov_b32_e32 v16, v6 +; CHECK-NEXT: v_mov_b32_e32 v17, v5 ; CHECK-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec -; CHECK-NEXT: global_load_dwordx4 v[10:13], v8, s[0:1] offset:32 +; CHECK-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 killed $exec +; CHECK-NEXT: v_mov_b32_e32 v5, v17 +; CHECK-NEXT: v_mov_b32_e32 v6, v16 +; CHECK-NEXT: v_mov_b32_e32 v7, v15 +; CHECK-NEXT: v_mov_b32_e32 v8, v14 +; CHECK-NEXT: v_mov_b32_e32 v9, v13 +; CHECK-NEXT: v_mov_b32_e32 v10, v12 +; CHECK-NEXT: v_mov_b32_e32 v11, v1 +; CHECK-NEXT: v_mov_b32_e32 v25, v11 +; CHECK-NEXT: v_mov_b32_e32 v26, v10 +; CHECK-NEXT: v_mov_b32_e32 v27, v9 +; CHECK-NEXT: v_mov_b32_e32 v28, v8 +; CHECK-NEXT: v_mov_b32_e32 v29, v7 +; CHECK-NEXT: v_mov_b32_e32 v30, v6 +; CHECK-NEXT: v_mov_b32_e32 v31, v5 +; CHECK-NEXT: v_mov_b32_e32 v8, v4 +; CHECK-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v5, v13 ; CHECK-NEXT: v_mov_b32_e32 v6, v12 ; CHECK-NEXT: v_mov_b32_e32 v7, v11 +; CHECK-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11_vgpr12_vgpr13 killed $exec +; CHECK-NEXT: s_mov_b64 s[6:7], 32 +; CHECK-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, s[6:7] +; CHECK-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, v15 +; CHECK-NEXT: v_mov_b32_e32 v2, v14 +; CHECK-NEXT: v_mov_b32_e32 v3, v13 +; CHECK-NEXT: v_mov_b32_e32 v4, v12 +; CHECK-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 killed $exec +; CHECK-NEXT: v_mov_b32_e32 v11, v7 +; CHECK-NEXT: v_mov_b32_e32 v12, v6 +; CHECK-NEXT: v_mov_b32_e32 v13, v5 +; CHECK-NEXT: v_mov_b32_e32 v14, v4 +; CHECK-NEXT: v_mov_b32_e32 v15, v3 +; CHECK-NEXT: v_mov_b32_e32 v16, v2 +; CHECK-NEXT: v_mov_b32_e32 v17, v1 +; CHECK-NEXT: v_mov_b32_e32 v1, v17 +; CHECK-NEXT: v_mov_b32_e32 v2, v16 +; CHECK-NEXT: v_mov_b32_e32 v3, v15 +; CHECK-NEXT: v_mov_b32_e32 v4, v14 +; CHECK-NEXT: v_mov_b32_e32 v5, v13 +; CHECK-NEXT: v_mov_b32_e32 v6, v12 +; CHECK-NEXT: v_mov_b32_e32 v7, v11 ; CHECK-NEXT: v_mov_b32_e32 v24, v10 -; CHECK-NEXT: global_load_dwordx4 v[10:13], v8, s[0:1] offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v25, v13 -; CHECK-NEXT: v_mov_b32_e32 v26, v12 -; CHECK-NEXT: v_mov_b32_e32 v27, v11 -; CHECK-NEXT: v_mov_b32_e32 v28, v10 -; CHECK-NEXT: global_load_dwordx4 v[8:11], v8, s[0:1] -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v29, v11 -; CHECK-NEXT: v_mov_b32_e32 v30, v10 -; CHECK-NEXT: v_mov_b32_e32 v31, v9 -; CHECK-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec ; CHECK-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 killed $exec ; CHECK-NEXT: v_mov_b32_e32 v9, v31 ; CHECK-NEXT: v_mov_b32_e32 v10, v30 @@ -60,12 +103,12 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) % ; CHECK-NEXT: v_mov_b32_e32 v21, v3 ; CHECK-NEXT: v_mov_b32_e32 v22, v2 ; CHECK-NEXT: v_mov_b32_e32 v23, v1 -; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[12:13] -; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[14:15] -; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[10:11] -; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[8:9] -; CHECK-NEXT: v_mov_b64_e32 v[26:27], s[6:7] -; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[4:5] +; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[16:17] +; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[18:19] +; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[14:15] +; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[12:13] +; CHECK-NEXT: v_mov_b64_e32 v[26:27], s[10:11] +; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[8:9] ; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: s_nop 1 ; CHECK-NEXT: v_smfmac_f32_32x32x32_f16 v[8:23], v[2:5], v[24:31], v1 cbsz:1 abid:2 @@ -78,7 +121,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) % ; CHECK-NEXT: v_mov_b32_e32 v3, v7 ; CHECK-NEXT: v_mov_b32_e32 v4, v6 ; CHECK-NEXT: v_mov_b32_e32 v5, v1 -; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 +; CHECK-NEXT: s_mov_b32 s2, s0 +; CHECK-NEXT: s_mov_b32 s3, s1 +; CHECK-NEXT: s_mov_b32 s5, s6 +; CHECK-NEXT: s_mov_b32 s4, s7 +; CHECK-NEXT: s_add_u32 s2, s2, s5 +; CHECK-NEXT: s_addc_u32 s4, s3, s4 +; CHECK-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; CHECK-NEXT: s_mov_b32 s3, s4 +; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[2:3] offset:16 ; CHECK-NEXT: v_mov_b32_e32 v1, v19 ; CHECK-NEXT: v_mov_b32_e32 v6, v18 ; CHECK-NEXT: v_mov_b32_e32 v7, v17 diff --git a/llvm/test/CodeGen/AMDGPU/sopk-no-literal.ll b/llvm/test/CodeGen/AMDGPU/sopk-no-literal.ll index a0ef3000f16d..a84391d75cae 100644 --- a/llvm/test/CodeGen/AMDGPU/sopk-no-literal.ll +++ b/llvm/test/CodeGen/AMDGPU/sopk-no-literal.ll @@ -2,10 +2,9 @@ ; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -debug-only=branch-relaxation < %s 2>&1 | FileCheck --check-prefix=GFX10 %s ; GFX10: Basic blocks after relaxation -; GFX10: %bb.0 offset=00000000 size=0x1c +; GFX10: %bb.0 offset=00000000 size=0x28 -; Each instruction in the following kernel is 4 bytes in size, -; except s_load_b32 which is 8 bytes in size. Hence, 0x1c bytes in total. +; At -O0 without DAG combines, more instructions are generated. define amdgpu_kernel void @test_sopk_size(i32 %var.mode) { ; GFX10-LABEL: test_sopk_size: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll index 3e4dbbd2f11f..c53820e43f5a 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-m0.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-m0.ll @@ -20,7 +20,7 @@ ; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 ; 4-byte Folded Spill ; TOVMEM: s_mov_b64 exec, [[COPY_EXEC]] -; GCN: s_cbranch_scc1 [[ENDIF:.LBB[0-9]+_[0-9]+]] +; GCN: s_cbranch_vccnz [[ENDIF:.LBB[0-9]+_[0-9]+]] ; GCN: [[ENDIF]]: ; TOVGPR: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[SPILL_VREG]], [[M0_LANE]] diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll index 586579fcaeb9..47b976abf12f 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll @@ -25,15 +25,18 @@ define void @test() { ; CHECK-NEXT: v_readfirstlane_b32 s6, v0 ; CHECK-NEXT: s_mov_b64 s[4:5], -1 ; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: s_cmp_eq_u32 s6, s7 +; CHECK-NEXT: s_cmp_lg_u32 s6, s7 +; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 +; CHECK-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] +; CHECK-NEXT: s_and_b64 vcc, exec, s[6:7] ; CHECK-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; CHECK-NEXT: v_writelane_b32 v1, s4, 0 ; CHECK-NEXT: v_writelane_b32 v1, s5, 1 -; CHECK-NEXT: s_mov_b64 s[10:11], exec -; CHECK-NEXT: s_mov_b64 exec, -1 +; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 +; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse ; CHECK-NEXT: s_mov_b64 exec, s[10:11] -; CHECK-NEXT: s_cbranch_scc1 .LBB0_5 +; CHECK-NEXT: s_cbranch_vccnz .LBB0_5 ; CHECK-NEXT: ; %bb.4: ; %bb.4 ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 @@ -54,9 +57,8 @@ define void @test() { ; CHECK-NEXT: s_mov_b64 exec, s[10:11] ; CHECK-NEXT: v_readlane_b32 s4, v1, 0 ; CHECK-NEXT: v_readlane_b32 s5, v1, 1 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; CHECK-NEXT: s_mov_b32 s4, 1 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s4 +; CHECK-NEXT: s_mov_b64 s[6:7], -1 +; CHECK-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 ; CHECK-NEXT: ; %bb.6: ; %bb.5 diff --git a/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll index e8e8385464f3..a6a8e630ed47 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll @@ -5,14 +5,14 @@ ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0 ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1 -; VGPR: s_cbranch_scc1 +; VGPR: s_cbranch_vccnz ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0 ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1 ; VMEM: buffer_store_dword -; VMEM: s_cbranch_scc1 +; VMEM: s_cbranch_vccnz ; VMEM: buffer_load_dword define amdgpu_kernel void @spill_sgpr_x2(ptr addrspace(1) %out, i32 %in) #0 { @@ -33,7 +33,7 @@ ret: ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0 ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1 ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2 -; VGPR: s_cbranch_scc1 +; VGPR: s_cbranch_vccnz ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0 ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1 @@ -41,7 +41,7 @@ ret: ; VMEM: buffer_store_dword -; VMEM: s_cbranch_scc1 +; VMEM: s_cbranch_vccnz ; VMEM: buffer_load_dword define amdgpu_kernel void @spill_sgpr_x3(ptr addrspace(1) %out, i32 %in) #0 { @@ -63,7 +63,7 @@ ret: ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1 ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2 ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 3 -; VGPR: s_cbranch_scc1 +; VGPR: s_cbranch_vccnz ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0 ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1 @@ -72,7 +72,7 @@ ret: ; VMEM: buffer_store_dword -; VMEM: s_cbranch_scc1 +; VMEM: s_cbranch_vccnz ; VMEM: buffer_load_dword define amdgpu_kernel void @spill_sgpr_x4(ptr addrspace(1) %out, i32 %in) #0 { @@ -95,7 +95,7 @@ ret: ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2 ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 3 ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 4 -; VGPR: s_cbranch_scc1 +; VGPR: s_cbranch_vccnz ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0 ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1 @@ -105,7 +105,7 @@ ret: ; VMEM: buffer_store_dword -; VMEM: s_cbranch_scc1 +; VMEM: s_cbranch_vccnz ; VMEM: buffer_load_dword define amdgpu_kernel void @spill_sgpr_x5(ptr addrspace(1) %out, i32 %in) #0 { @@ -131,7 +131,7 @@ ret: ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 5 ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 6 ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 7 -; VGPR: s_cbranch_scc1 +; VGPR: s_cbranch_vccnz ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0 ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1 @@ -143,7 +143,7 @@ ret: ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 7 ; VMEM: buffer_store_dword -; VMEM: s_cbranch_scc1 +; VMEM: s_cbranch_vccnz ; VMEM: buffer_load_dword define amdgpu_kernel void @spill_sgpr_x8(ptr addrspace(1) %out, i32 %in) #0 { @@ -177,7 +177,7 @@ ret: ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 13 ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 14 ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 15 -; VGPR: s_cbranch_scc1 +; VGPR: s_cbranch_vccnz ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0 ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1 @@ -197,7 +197,7 @@ ret: ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 15 ; VMEM: buffer_store_dword -; VMEM: s_cbranch_scc1 +; VMEM: s_cbranch_vccnz ; VMEM: buffer_load_dword define amdgpu_kernel void @spill_sgpr_x16(ptr addrspace(1) %out, i32 %in) #0 { @@ -247,7 +247,7 @@ ret: ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 29 ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 30 ; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 31 -; VGPR: s_cbranch_scc1 +; VGPR: s_cbranch_vccnz ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0 ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1 @@ -283,7 +283,7 @@ ret: ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 31 ; VMEM: buffer_store_dword -; VMEM: s_cbranch_scc1 +; VMEM: s_cbranch_vccnz ; VMEM: buffer_load_dword define amdgpu_kernel void @spill_sgpr_x32(ptr addrspace(1) %out, i32 %in) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index 838ecf9afff2..a4aa8d55106e 100644 --- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -10,10 +10,39 @@ declare ptr addrspace(5) @llvm.stacksave.p5() declare void @llvm.stackrestore.p5(ptr addrspace(5)) define hidden void @stack_passed_argument([32 x i32], i32) { -; GCN-LABEL: stack_passed_argument: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; WAVE32-OPT-LABEL: stack_passed_argument: +; WAVE32-OPT: ; %bb.0: +; WAVE32-OPT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE32-OPT-NEXT: s_setpc_b64 s[30:31] +; +; WAVE64-OPT-LABEL: stack_passed_argument: +; WAVE64-OPT: ; %bb.0: +; WAVE64-OPT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE64-OPT-NEXT: s_setpc_b64 s[30:31] +; +; WAVE32-O0-LABEL: stack_passed_argument: +; WAVE32-O0: ; %bb.0: +; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) +; WAVE32-O0-NEXT: s_setpc_b64 s[30:31] +; +; WAVE64-O0-LABEL: stack_passed_argument: +; WAVE64-O0: ; %bb.0: +; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) +; WAVE64-O0-NEXT: s_setpc_b64 s[30:31] +; +; WAVE32-WWM-PREALLOC-LABEL: stack_passed_argument: +; WAVE32-WWM-PREALLOC: ; %bb.0: +; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt vmcnt(0) +; WAVE32-WWM-PREALLOC-NEXT: s_setpc_b64 s[30:31] ret void } @@ -810,6 +839,8 @@ define amdgpu_kernel void @kernel_stacksave_sgpr(ptr addrspace(5) %stack) { ; WAVE32-O0: ; %bb.0: ; WAVE32-O0-NEXT: s_load_dword s0, s[4:5], 0x0 ; WAVE32-O0-NEXT: s_waitcnt lgkmcnt(0) +; WAVE32-O0-NEXT: s_load_dword s0, s[4:5], 0x0 +; WAVE32-O0-NEXT: s_waitcnt lgkmcnt(0) ; WAVE32-O0-NEXT: s_mov_b32 s1, s0 ; WAVE32-O0-NEXT: ;;#ASMSTART ; WAVE32-O0-NEXT: ; use s1 @@ -822,6 +853,8 @@ define amdgpu_kernel void @kernel_stacksave_sgpr(ptr addrspace(5) %stack) { ; WAVE64-O0: ; %bb.0: ; WAVE64-O0-NEXT: s_load_dword s0, s[4:5], 0x0 ; WAVE64-O0-NEXT: s_waitcnt lgkmcnt(0) +; WAVE64-O0-NEXT: s_load_dword s0, s[4:5], 0x0 +; WAVE64-O0-NEXT: s_waitcnt lgkmcnt(0) ; WAVE64-O0-NEXT: s_mov_b32 s1, s0 ; WAVE64-O0-NEXT: ;;#ASMSTART ; WAVE64-O0-NEXT: ; use s1 @@ -834,6 +867,8 @@ define amdgpu_kernel void @kernel_stacksave_sgpr(ptr addrspace(5) %stack) { ; WAVE32-WWM-PREALLOC: ; %bb.0: ; WAVE32-WWM-PREALLOC-NEXT: s_load_dword s0, s[4:5], 0x0 ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt lgkmcnt(0) +; WAVE32-WWM-PREALLOC-NEXT: s_load_dword s0, s[4:5], 0x0 +; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt lgkmcnt(0) ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s1, s0 ; WAVE32-WWM-PREALLOC-NEXT: ;;#ASMSTART ; WAVE32-WWM-PREALLOC-NEXT: ; use s1 @@ -950,6 +985,9 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-NEXT: s_mov_b32 s15, s32 ; WAVE32-O0-NEXT: v_mov_b32_e32 v3, 17 ; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], s15 offset:4 +; WAVE32-O0-NEXT: ; implicit-def: $sgpr16 +; WAVE32-O0-NEXT: v_mov_b32_e32 v3, s16 +; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], s15 ; WAVE32-O0-NEXT: s_mov_b32 s15, stack_passed_argument@abs32@hi ; WAVE32-O0-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo ; WAVE32-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17 @@ -1060,6 +1098,9 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE64-O0-NEXT: s_mov_b32 s15, s32 ; WAVE64-O0-NEXT: v_mov_b32_e32 v3, 17 ; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], s15 offset:4 +; WAVE64-O0-NEXT: ; implicit-def: $sgpr16 +; WAVE64-O0-NEXT: v_mov_b32_e32 v3, s16 +; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], s15 ; WAVE64-O0-NEXT: s_mov_b32 s15, stack_passed_argument@abs32@hi ; WAVE64-O0-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo ; WAVE64-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17 @@ -1171,6 +1212,9 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s15, s32 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v3, 17 ; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], s15 offset:4 +; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr16 +; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v3, s16 +; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], s15 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s15, stack_passed_argument@abs32@hi ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo ; WAVE32-WWM-PREALLOC-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17 @@ -1356,6 +1400,9 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-NEXT: s_mov_b32 s16, s32 ; WAVE32-O0-NEXT: v_mov_b32_e32 v0, 17 ; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s16 offset:4 +; WAVE32-O0-NEXT: ; implicit-def: $sgpr17 +; WAVE32-O0-NEXT: v_mov_b32_e32 v0, s17 +; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s16 ; WAVE32-O0-NEXT: s_mov_b32 s18, stack_passed_argument@abs32@hi ; WAVE32-O0-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo ; WAVE32-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17 @@ -1467,6 +1514,9 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE64-O0-NEXT: s_mov_b32 s16, s32 ; WAVE64-O0-NEXT: v_mov_b32_e32 v0, 17 ; WAVE64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s16 offset:4 +; WAVE64-O0-NEXT: ; implicit-def: $sgpr17 +; WAVE64-O0-NEXT: v_mov_b32_e32 v0, s17 +; WAVE64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s16 ; WAVE64-O0-NEXT: s_mov_b32 s18, stack_passed_argument@abs32@hi ; WAVE64-O0-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo ; WAVE64-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17 @@ -1578,6 +1628,9 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s16, s32 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v0, 17 ; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v0, off, s[0:3], s16 offset:4 +; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr17 +; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v0, s17 +; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v0, off, s[0:3], s16 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s18, stack_passed_argument@abs32@hi ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo ; WAVE32-WWM-PREALLOC-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17 @@ -1674,5 +1727,6 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} ; WAVE32: {{.*}} ; WAVE64: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll index 9c0beb2ed358..822f4a2cd8c6 100644 --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -79,6 +79,8 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; HSA-TRAP-GFX1100-O0-LABEL: trap: ; HSA-TRAP-GFX1100-O0: ; %bb.0: ; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v1, 1 ; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0) @@ -212,6 +214,8 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; HSA-TRAP-GFX1100-O0: ; %bb.0: ; %entry ; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b64 s[2:3], s[0:1] ; HSA-TRAP-GFX1100-O0-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; HSA-TRAP-GFX1100-O0-NEXT: v_writelane_b32 v2, s2, 0 @@ -355,8 +359,12 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; ; HSA-TRAP-GFX1100-O0-LABEL: trap_with_use_after: ; HSA-TRAP-GFX1100-O0: ; %bb.0: +; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v0, off offset:8 ; 4-byte Folded Spill +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; HSA-TRAP-GFX1100-O0-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane @@ -477,6 +485,8 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) ; HSA-TRAP-GFX1100-O0-LABEL: debugtrap: ; HSA-TRAP-GFX1100-O0: ; %bb.0: ; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v1, 1 ; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-limit-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/vgpr-limit-gfx1250.ll index 85904b7d5f83..2b1309492b73 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-limit-gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-limit-gfx1250.ll @@ -558,8 +558,8 @@ define amdgpu_kernel void @k256_w1_asm() #2561 { } ; GCN-LABEL: {{^}}use512vgprs_codegen: -; GFX1250: NumVgprs: 512 -; GFX1250: VGPRBlocks: 31 +; GFX1250: NumVgprs: 482 +; GFX1250: VGPRBlocks: 30 define amdgpu_kernel void @use512vgprs_codegen(ptr %p) #2561 { %r0 = load volatile <512 x float>, ptr %p, align 1 store volatile <512 x float> %r0, ptr %p @@ -567,8 +567,8 @@ define amdgpu_kernel void @use512vgprs_codegen(ptr %p) #2561 { } ; GCN-LABEL: {{^}}use1024vgprs_codegen: -; GFX1250: NumVgprs: 1024 -; GFX1250: VGPRBlocks: 63 +; GFX1250: NumVgprs: 998 +; GFX1250: VGPRBlocks: 62 define amdgpu_kernel void @use1024vgprs_codegen(ptr %p) #1281 { %r0 = load volatile <1024 x float>, ptr %p, align 1 store volatile <1024 x float> %r0, ptr %p diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll index c40ba2be882b..991681d4f288 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll @@ -46,7 +46,9 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() { ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_and_b32_e64 v0, s4, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s4 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, s4 +; CHECK-NEXT: s_mov_b64 s[6:7], -1 +; CHECK-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_4 ; CHECK-NEXT: ; %bb.3: ; %bb201 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll index 16e0a34376b2..7a8d66ba84b7 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll @@ -23,6 +23,8 @@ define protected amdgpu_kernel void @kern(ptr %addr) !llvm.amdgcn.lds.kernel.id ; CHECK-NEXT: v_readlane_b32 s14, v40, 0 ; CHECK-NEXT: s_mov_b64 s[16:17], s[8:9] ; CHECK-NEXT: s_load_dwordx2 s[8:9], s[16:17], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_load_dwordx2 s[8:9], s[16:17], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v5, 42 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v3, s8 diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt-atomic-rmw-optimization.ll b/llvm/test/CodeGen/AMDGPU/wait-xcnt-atomic-rmw-optimization.ll index 0f6edafb2290..04fad1cb9164 100644 --- a/llvm/test/CodeGen/AMDGPU/wait-xcnt-atomic-rmw-optimization.ll +++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt-atomic-rmw-optimization.ll @@ -7,6 +7,8 @@ define amdgpu_kernel void @single_atomic_rmw(ptr addrspace(1) %ptr) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -32,6 +34,8 @@ define amdgpu_kernel void @atomic_rmw_back_to_back(ptr addrspace(1) %ptr) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -80,7 +84,13 @@ define amdgpu_kernel void @atomic_rmw_with_alu(ptr addrspace(1) %ptr, i32 %a, i3 ; GFX1250-LABEL: atomic_rmw_with_alu: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0xc nv @@ -135,7 +145,11 @@ define amdgpu_kernel void @atomic_rmw_broken_by_global_load(ptr addrspace(1) %pt ; GFX1250-LABEL: atomic_rmw_broken_by_global_load: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v1, 1 @@ -187,7 +201,11 @@ define amdgpu_kernel void @atomic_rmw_broken_by_global_store(ptr addrspace(1) %p ; GFX1250-LABEL: atomic_rmw_broken_by_global_store: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v1, 1 @@ -242,7 +260,11 @@ define amdgpu_kernel void @atomic_rmw_broken_by_flat_load(ptr addrspace(1) %ptr, ; GFX1250-LABEL: atomic_rmw_broken_by_flat_load: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v1, 1 @@ -293,7 +315,11 @@ define amdgpu_kernel void @atomic_rmw_broken_by_flat_store(ptr addrspace(1) %ptr ; GFX1250-LABEL: atomic_rmw_broken_by_flat_store: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v1, 1 @@ -347,7 +373,11 @@ define amdgpu_kernel void @atomic_rmw_broken_by_smem_load(ptr addrspace(1) %ptr, ; GFX1250-LABEL: atomic_rmw_broken_by_smem_load: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v1, 1 @@ -401,7 +431,11 @@ define amdgpu_kernel void @atomic_rmw_broken_by_atomic_store(ptr addrspace(1) %p ; GFX1250-LABEL: atomic_rmw_broken_by_atomic_store: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v1, 1 @@ -462,7 +496,11 @@ define amdgpu_kernel void @atomic_rmw_with_lds_load(ptr addrspace(1) %ptr, ptr a ; GFX1250-LABEL: atomic_rmw_with_lds_load: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v1, 1 @@ -514,7 +552,11 @@ define amdgpu_kernel void @atomic_rmw_with_lds_store(ptr addrspace(1) %ptr, ptr ; GFX1250-LABEL: atomic_rmw_with_lds_store: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v1, 1 @@ -568,7 +610,11 @@ define amdgpu_kernel void @atomic_rmw_with_flat_lds_load(ptr addrspace(1) %ptr, ; GFX1250-LABEL: atomic_rmw_with_flat_lds_load: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v1, 1 @@ -593,14 +639,17 @@ define amdgpu_kernel void @atomic_rmw_with_flat_lds_load(ptr addrspace(1) %ptr, ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s5, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s4, s7 ; GFX1250-NEXT: s_mov_b64 s[8:9], src_shared_base ; GFX1250-NEXT: s_mov_b32 s3, s9 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s5 +; GFX1250-NEXT: s_and_b32 s8, s5, exec_lo ; GFX1250-NEXT: s_cselect_b32 s4, s3, s4 ; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: s_and_b32 s5, s5, exec_lo ; GFX1250-NEXT: s_cselect_b32 s2, s2, s3 ; GFX1250-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 ; GFX1250-NEXT: s_mov_b32 s3, s4 @@ -631,7 +680,11 @@ define amdgpu_kernel void @atomic_rmw_with_flat_lds_store(ptr addrspace(1) %ptr, ; GFX1250-LABEL: atomic_rmw_with_flat_lds_store: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v1, 1 @@ -656,14 +709,17 @@ define amdgpu_kernel void @atomic_rmw_with_flat_lds_store(ptr addrspace(1) %ptr, ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s5, -1, 0 ; GFX1250-NEXT: s_mov_b64 s[6:7], 0 ; GFX1250-NEXT: s_mov_b32 s4, s7 ; GFX1250-NEXT: s_mov_b64 s[8:9], src_shared_base ; GFX1250-NEXT: s_mov_b32 s3, s9 -; GFX1250-NEXT: s_mov_b32 s5, -1 -; GFX1250-NEXT: s_cmp_lg_u32 s2, s5 +; GFX1250-NEXT: s_and_b32 s8, s5, exec_lo ; GFX1250-NEXT: s_cselect_b32 s4, s3, s4 ; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: s_and_b32 s5, s5, exec_lo ; GFX1250-NEXT: s_cselect_b32 s2, s2, s3 ; GFX1250-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 ; GFX1250-NEXT: s_mov_b32 s3, s4 @@ -698,7 +754,13 @@ define amdgpu_kernel void @atomic_rmw_borken_by_async_lds_copy(ptr addrspace(1) ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX1250-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[2:3], 0x10 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 nv ; GFX1250-NEXT: s_load_b64 s[4:5], s[2:3], 0x8 nv ; GFX1250-NEXT: s_wait_xcnt 0x0 @@ -758,7 +820,11 @@ define amdgpu_kernel void @multiple_atomic_rmw_blocks(ptr addrspace(1) %ptr1, pt ; GFX1250-LABEL: multiple_atomic_rmw_blocks: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 nv ; GFX1250-NEXT: v_mov_b32_e32 v1, 1 @@ -823,6 +889,8 @@ define amdgpu_kernel void @different_atomic_ops(ptr addrspace(1) %ptr) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -883,9 +951,13 @@ define amdgpu_kernel void @atomic_rmw_across_basic_blocks(ptr addrspace(1) %ptr, ; GFX1250-LABEL: atomic_rmw_across_basic_blocks: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX1250-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GFX1250-NEXT: v_writelane_b32 v2, s4, 0 @@ -917,8 +989,13 @@ define amdgpu_kernel void @atomic_rmw_across_basic_blocks(ptr addrspace(1) %ptr, ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: s_cmp_lg_u32 s0, s1 -; GFX1250-NEXT: s_cbranch_scc1 .LBB16_2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_eq_u32 s0, s1 +; GFX1250-NEXT: s_cselect_b32 s0, -1, 0 +; GFX1250-NEXT: s_mov_b32 s1, -1 +; GFX1250-NEXT: s_xor_b32 s0, s0, s1 +; GFX1250-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX1250-NEXT: s_cbranch_vccnz .LBB16_2 ; GFX1250-NEXT: ; %bb.1: ; %then ; GFX1250-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1250-NEXT: scratch_load_b32 v2, off, off nv ; 4-byte Folded Reload @@ -963,6 +1040,9 @@ define amdgpu_kernel void @atomic_rmw_in_loop(ptr addrspace(1) %ptr, i32 %n) { ; GFX1250-LABEL: atomic_rmw_in_loop: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane @@ -1011,14 +1091,17 @@ define amdgpu_kernel void @atomic_rmw_in_loop(ptr addrspace(1) %ptr, i32 %n) { ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_mov_b32 s2, 1 ; GFX1250-NEXT: s_add_co_i32 s0, s0, s2 -; GFX1250-NEXT: s_cmp_lt_u32 s0, s1 +; GFX1250-NEXT: s_cmp_ge_u32 s0, s1 +; GFX1250-NEXT: s_cselect_b32 s1, -1, 0 +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_xor_b32 s1, s1, s2 +; GFX1250-NEXT: s_and_b32 vcc_lo, exec_lo, s1 ; GFX1250-NEXT: v_writelane_b32 v2, s0, 3 -; GFX1250-NEXT: s_mov_b32 s6, exec_lo -; GFX1250-NEXT: s_mov_b32 exec_lo, -1 +; GFX1250-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1250-NEXT: scratch_store_b32 off, v2, off nv ; 4-byte Folded Spill ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b32 exec_lo, s6 -; GFX1250-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1250-NEXT: s_cbranch_vccnz .LBB17_1 ; GFX1250-NEXT: ; %bb.2: ; %exit ; GFX1250-NEXT: s_endpgm entry: @@ -1043,6 +1126,9 @@ define amdgpu_kernel void @atomic_rmw_with_branch(ptr addrspace(1) %ptr, i32 %co ; GFX1250-LABEL: atomic_rmw_with_branch: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1089,9 +1175,8 @@ define amdgpu_kernel void @atomic_rmw_with_branch(ptr addrspace(1) %ptr, i32 %co ; GFX1250-NEXT: s_mov_b32 exec_lo, s6 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_readlane_b32 s0, v2, 2 -; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX1250-NEXT: s_mov_b32 s0, 1 -; GFX1250-NEXT: v_cmp_ne_u32_e64 s0, v0, s0 +; GFX1250-NEXT: s_mov_b32 s1, -1 +; GFX1250-NEXT: s_xor_b32 s0, s0, s1 ; GFX1250-NEXT: s_and_b32 vcc_lo, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_vccnz .LBB18_4 ; GFX1250-NEXT: ; %bb.2: ; %bb1 @@ -1194,6 +1279,8 @@ define amdgpu_kernel void @atomic_rmw_fallthrough(ptr addrspace(1) %ptr) { ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX1250-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GFX1250-NEXT: v_writelane_b32 v2, s2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index e9a0671ead4e..cfebf404fe92 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -53,7 +53,7 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[40:41], v3, v4 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[40:41] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[40:41] ; GFX9-O0-NEXT: s_mov_b32 s35, 1 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s35, v3 ; GFX9-O0-NEXT: s_mov_b32 s35, 2 @@ -220,7 +220,7 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v0, v3 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[36:37] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[36:37] ; GFX9-O0-NEXT: s_mov_b32 s36, 1 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s36, v0 ; GFX9-O0-NEXT: s_mov_b32 s36, 2 @@ -533,9 +533,9 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill @@ -590,9 +590,9 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[34:35] ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 ; GFX9-O0-NEXT: s_mov_b32 s34, 32 -; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s34, v[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s34, v[8:9] ; GFX9-O0-NEXT: s_getpc_b64 s[34:35] ; GFX9-O0-NEXT: s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4 ; GFX9-O0-NEXT: s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12 @@ -601,8 +601,8 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: s_mov_b64 s[36:37], s[0:1] ; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-O0-NEXT: v_readlane_b32 s34, v11, 4 @@ -630,8 +630,8 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload @@ -731,8 +731,11 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O0-NEXT: buffer_load_dwordx4 v[11:14], v0, s[36:39], s34 offen ; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], v0, s[36:39], s34 offen offset:16 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 ; GFX9-O0-NEXT: s_mov_b32 s35, 0x7fffffff ; GFX9-O0-NEXT: s_mov_b32 s44, -1 @@ -741,7 +744,10 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O0-NEXT: s_mov_b32 s42, s45 ; GFX9-O0-NEXT: ; implicit-def: $sgpr46_sgpr47 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[40:41] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[40:41] +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 ; GFX9-O0-NEXT: s_mov_b32 s35, s44 ; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35 @@ -750,12 +756,18 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v14 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[40:41] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[40:41] +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[40:41] @@ -1056,20 +1068,28 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v10 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-O0-NEXT: s_mov_b32 s34, 8 +; GFX9-O0-NEXT: v_add_u32_e64 v10, v1, s34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9 -; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:12 +; GFX9-O0-NEXT: buffer_store_dword v0, v10, s[0:3], 0 offen offset:4 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:8 +; GFX9-O0-NEXT: s_mov_b32 s34, 16 +; GFX9-O0-NEXT: v_add_u32_e64 v8, v1, s34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 -; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:20 +; GFX9-O0-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen offset:4 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX9-O0-NEXT: s_mov_b32 s34, 24 +; GFX9-O0-NEXT: v_add_u32_e64 v6, v1, s34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 -; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:28 +; GFX9-O0-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:24 +; GFX9-O0-NEXT: s_mov_b32 s34, 32 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v1, s34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:36 +; GFX9-O0-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:32 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 72672c8b6efa..8b3bf0290ec3 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -49,7 +49,7 @@ define amdgpu_cs void @no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v3, v4 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[6:7] ; GFX9-O0-NEXT: s_mov_b32 s5, 1 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s5, v3 ; GFX9-O0-NEXT: s_mov_b32 s5, 2 @@ -197,7 +197,7 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v3 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] ; GFX9-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s4, v0 ; GFX9-O0-NEXT: s_mov_b32 s4, 2 @@ -330,12 +330,20 @@ define amdgpu_kernel void @call(ptr addrspace(8) %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-O0-NEXT: s_mov_b32 s2, 36 +; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], s2 offset:0x8 +; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_mov_b32 s3, s9 ; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 +; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_mov_b32 s9, s17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 @@ -555,12 +563,20 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) %tmp14, i64 %arg) { ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-O0-NEXT: s_mov_b32 s8, 36 +; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], s8 offset:0x8 +; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-O0-NEXT: s_mov_b32 s8, s3 +; GFX9-O0-NEXT: s_mov_b32 s9, s2 ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c ; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s8, s19 -; GFX9-O0-NEXT: s_mov_b32 s9, s18 ; GFX9-O0-NEXT: s_mov_b32 s15, s17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 @@ -591,9 +607,10 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) %tmp14, i64 %arg) { ; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 ; GFX9-O0-NEXT: s_mov_b32 s2, 32 ; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s2, v[9:10] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], 60 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -616,8 +633,8 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) %tmp14, i64 %arg) { ; GFX9-O0-NEXT: v_or3_b32 v3, v5, v4, v3 ; GFX9-O0-NEXT: ; implicit-def: $sgpr15 ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: v_readlane_b32 s0, v8, 4 @@ -723,8 +740,11 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) { ; GFX9-O0-NEXT: buffer_load_dwordx4 v[11:14], v0, s[0:3], s4 offen ; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, 0x7fffffff ; GFX9-O0-NEXT: s_mov_b32 s10, -1 @@ -733,7 +753,10 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) { ; GFX9-O0-NEXT: s_mov_b32 s8, s11 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, s10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 @@ -742,12 +765,18 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) { ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v14 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[6:7] @@ -879,7 +908,7 @@ define amdgpu_cs void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v3, v4 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[6:7] ; GFX9-O0-NEXT: s_mov_b32 s5, 1 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s5, v3 ; GFX9-O0-NEXT: s_mov_b32 s5, 2 @@ -1027,7 +1056,7 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v3 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] ; GFX9-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s4, v0 ; GFX9-O0-NEXT: s_mov_b32 s4, 2 @@ -1160,12 +1189,20 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-O0-NEXT: s_mov_b32 s2, 36 +; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], s2 offset:0x8 +; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_mov_b32 s3, s9 ; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 +; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_mov_b32 s9, s17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 @@ -1385,12 +1422,20 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) %tmp14, i64 %arg ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-O0-NEXT: s_mov_b32 s8, 36 +; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], s8 offset:0x8 +; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-O0-NEXT: s_mov_b32 s8, s3 +; GFX9-O0-NEXT: s_mov_b32 s9, s2 ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c ; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s8, s19 -; GFX9-O0-NEXT: s_mov_b32 s9, s18 ; GFX9-O0-NEXT: s_mov_b32 s15, s17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 @@ -1421,9 +1466,10 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) %tmp14, i64 %arg ; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 ; GFX9-O0-NEXT: s_mov_b32 s2, 32 ; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s2, v[9:10] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], 60 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -1446,8 +1492,8 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) %tmp14, i64 %arg ; GFX9-O0-NEXT: v_or3_b32 v3, v5, v4, v3 ; GFX9-O0-NEXT: ; implicit-def: $sgpr15 ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: v_readlane_b32 s0, v8, 4 @@ -1553,8 +1599,11 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind ; GFX9-O0-NEXT: buffer_load_dwordx4 v[11:14], v0, s[0:3], s4 offen ; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, 0x7fffffff ; GFX9-O0-NEXT: s_mov_b32 s10, -1 @@ -1563,7 +1612,10 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind ; GFX9-O0-NEXT: s_mov_b32 s8, s11 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, s10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 @@ -1572,12 +1624,18 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v14 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/zext-lid.ll b/llvm/test/CodeGen/AMDGPU/zext-lid.ll index 6fea05d8d740..395862c58f4e 100644 --- a/llvm/test/CodeGen/AMDGPU/zext-lid.ll +++ b/llvm/test/CodeGen/AMDGPU/zext-lid.ll @@ -2,7 +2,7 @@ ; RUN: llc -O0 -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; GCN-LABEL: {{^}}zext_grp_size_128: -; GCN-NOT: and_b32 +; O2-NOT: and_b32 define amdgpu_kernel void @zext_grp_size_128(ptr addrspace(1) nocapture %arg) #0 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -20,7 +20,7 @@ bb: } ; GCN-LABEL: {{^}}zext_grp_size_32x4x1: -; GCN-NOT: and_b32 +; O2-NOT: and_b32 define amdgpu_kernel void @zext_grp_size_32x4x1(ptr addrspace(1) nocapture %arg) #0 !reqd_work_group_size !0 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -38,7 +38,7 @@ bb: } ; GCN-LABEL: {{^}}zext_grp_size_1x1x1: -; GCN-NOT: and_b32 +; O2-NOT: and_b32 ; When EarlyCSE is not run this call produces a range max with 0 active bits, ; which is a special case as an AssertZext from width 0 is invalid. @@ -50,7 +50,7 @@ define amdgpu_kernel void @zext_grp_size_1x1x1(ptr addrspace(1) nocapture %arg) } ; GCN-LABEL: {{^}}zext_grp_size_512: -; GCN-NOT: and_b32 +; O2-NOT: and_b32 define amdgpu_kernel void @zext_grp_size_512(ptr addrspace(1) nocapture %arg) #1 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()